From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/rocksdb/db/arena_wrapped_db_iter.cc | 106 + src/rocksdb/db/arena_wrapped_db_iter.h | 112 + src/rocksdb/db/blob_index.h | 179 + src/rocksdb/db/builder.cc | 263 + src/rocksdb/db/builder.h | 88 + src/rocksdb/db/c.cc | 4451 +++++++++++++ src/rocksdb/db/c_test.c | 1866 ++++++ src/rocksdb/db/column_family.cc | 1523 +++++ src/rocksdb/db/column_family.h | 757 +++ src/rocksdb/db/column_family_test.cc | 3387 ++++++++++ src/rocksdb/db/compact_files_test.cc | 421 ++ src/rocksdb/db/compacted_db_impl.cc | 160 + src/rocksdb/db/compacted_db_impl.h | 113 + src/rocksdb/db/compaction/compaction.cc | 564 ++ src/rocksdb/db/compaction/compaction.h | 384 ++ .../db/compaction/compaction_iteration_stats.h | 37 + src/rocksdb/db/compaction/compaction_iterator.cc | 774 +++ src/rocksdb/db/compaction/compaction_iterator.h | 240 + .../db/compaction/compaction_iterator_test.cc | 976 +++ src/rocksdb/db/compaction/compaction_job.cc | 1700 +++++ src/rocksdb/db/compaction/compaction_job.h | 198 + .../db/compaction/compaction_job_stats_test.cc | 1043 ++++ src/rocksdb/db/compaction/compaction_job_test.cc | 1082 ++++ src/rocksdb/db/compaction/compaction_picker.cc | 1131 ++++ src/rocksdb/db/compaction/compaction_picker.h | 313 + .../db/compaction/compaction_picker_fifo.cc | 244 + src/rocksdb/db/compaction/compaction_picker_fifo.h | 53 + .../db/compaction/compaction_picker_level.cc | 558 ++ .../db/compaction/compaction_picker_level.h | 32 + .../db/compaction/compaction_picker_test.cc | 1741 ++++++ .../db/compaction/compaction_picker_universal.cc | 1105 ++++ .../db/compaction/compaction_picker_universal.h | 31 + src/rocksdb/db/comparator_db_test.cc | 660 ++ src/rocksdb/db/convenience.cc | 77 + src/rocksdb/db/corruption_test.cc | 613 ++ src/rocksdb/db/cuckoo_table_db_test.cc | 351 ++ src/rocksdb/db/db_basic_test.cc | 2545 ++++++++ src/rocksdb/db/db_blob_index_test.cc | 436 ++ src/rocksdb/db/db_block_cache_test.cc | 761 +++ src/rocksdb/db/db_bloom_filter_test.cc | 1910 ++++++ src/rocksdb/db/db_compaction_filter_test.cc | 872 +++ src/rocksdb/db/db_compaction_test.cc | 5167 +++++++++++++++ src/rocksdb/db/db_dynamic_level_test.cc | 505 ++ src/rocksdb/db/db_encryption_test.cc | 122 + src/rocksdb/db/db_filesnapshot.cc | 177 + src/rocksdb/db/db_flush_test.cc | 784 +++ src/rocksdb/db/db_impl/db_impl.cc | 4550 ++++++++++++++ src/rocksdb/db/db_impl/db_impl.h | 2107 +++++++ src/rocksdb/db/db_impl/db_impl_compaction_flush.cc | 3116 +++++++++ src/rocksdb/db/db_impl/db_impl_debug.cc | 294 + src/rocksdb/db/db_impl/db_impl_experimental.cc | 151 + src/rocksdb/db/db_impl/db_impl_files.cc | 667 ++ src/rocksdb/db/db_impl/db_impl_open.cc | 1651 +++++ src/rocksdb/db/db_impl/db_impl_readonly.cc | 221 + src/rocksdb/db/db_impl/db_impl_readonly.h | 137 + src/rocksdb/db/db_impl/db_impl_secondary.cc | 671 ++ src/rocksdb/db/db_impl/db_impl_secondary.h | 333 + src/rocksdb/db/db_impl/db_impl_write.cc | 1839 ++++++ src/rocksdb/db/db_impl/db_secondary_test.cc | 869 +++ src/rocksdb/db/db_info_dumper.cc | 123 + src/rocksdb/db/db_info_dumper.h | 14 + src/rocksdb/db/db_inplace_update_test.cc | 177 + src/rocksdb/db/db_io_failure_test.cc | 568 ++ src/rocksdb/db/db_iter.cc | 1310 ++++ src/rocksdb/db/db_iter.h | 344 + src/rocksdb/db/db_iter_stress_test.cc | 654 ++ src/rocksdb/db/db_iter_test.cc | 3175 ++++++++++ src/rocksdb/db/db_iterator_test.cc | 2998 +++++++++ 
src/rocksdb/db/db_log_iter_test.cc | 294 + src/rocksdb/db/db_memtable_test.cc | 340 + src/rocksdb/db/db_merge_operand_test.cc | 240 + src/rocksdb/db/db_merge_operator_test.cc | 666 ++ src/rocksdb/db/db_options_test.cc | 870 +++ src/rocksdb/db/db_properties_test.cc | 1711 +++++ src/rocksdb/db/db_range_del_test.cc | 1660 +++++ src/rocksdb/db/db_sst_test.cc | 1227 ++++ src/rocksdb/db/db_statistics_test.cc | 149 + src/rocksdb/db/db_table_properties_test.cc | 336 + src/rocksdb/db/db_tailing_iter_test.cc | 547 ++ src/rocksdb/db/db_test.cc | 6605 ++++++++++++++++++++ src/rocksdb/db/db_test2.cc | 4695 ++++++++++++++ src/rocksdb/db/db_test_util.cc | 1564 +++++ src/rocksdb/db/db_test_util.h | 1000 +++ src/rocksdb/db/db_universal_compaction_test.cc | 2254 +++++++ src/rocksdb/db/db_wal_test.cc | 1586 +++++ src/rocksdb/db/db_write_test.cc | 329 + src/rocksdb/db/dbformat.cc | 197 + src/rocksdb/db/dbformat.h | 671 ++ src/rocksdb/db/dbformat_test.cc | 207 + src/rocksdb/db/deletefile_test.cc | 571 ++ src/rocksdb/db/error_handler.cc | 344 + src/rocksdb/db/error_handler.h | 75 + src/rocksdb/db/error_handler_test.cc | 871 +++ src/rocksdb/db/event_helpers.cc | 223 + src/rocksdb/db/event_helpers.h | 55 + src/rocksdb/db/experimental.cc | 50 + src/rocksdb/db/external_sst_file_basic_test.cc | 1128 ++++ src/rocksdb/db/external_sst_file_ingestion_job.cc | 731 +++ src/rocksdb/db/external_sst_file_ingestion_job.h | 180 + src/rocksdb/db/external_sst_file_test.cc | 2832 +++++++++ src/rocksdb/db/fault_injection_test.cc | 555 ++ src/rocksdb/db/file_indexer.cc | 216 + src/rocksdb/db/file_indexer.h | 142 + src/rocksdb/db/file_indexer_test.cc | 350 ++ src/rocksdb/db/filename_test.cc | 180 + src/rocksdb/db/flush_job.cc | 466 ++ src/rocksdb/db/flush_job.h | 158 + src/rocksdb/db/flush_job_test.cc | 498 ++ src/rocksdb/db/flush_scheduler.cc | 86 + src/rocksdb/db/flush_scheduler.h | 54 + src/rocksdb/db/forward_iterator.cc | 975 +++ src/rocksdb/db/forward_iterator.h | 160 + src/rocksdb/db/forward_iterator_bench.cc | 377 ++ src/rocksdb/db/import_column_family_job.cc | 276 + src/rocksdb/db/import_column_family_job.h | 72 + src/rocksdb/db/import_column_family_test.cc | 567 ++ src/rocksdb/db/internal_stats.cc | 1424 +++++ src/rocksdb/db/internal_stats.h | 697 +++ src/rocksdb/db/job_context.h | 219 + src/rocksdb/db/listener_test.cc | 1042 +++ src/rocksdb/db/log_format.h | 48 + src/rocksdb/db/log_reader.cc | 624 ++ src/rocksdb/db/log_reader.h | 189 + src/rocksdb/db/log_test.cc | 928 +++ src/rocksdb/db/log_writer.cc | 162 + src/rocksdb/db/log_writer.h | 114 + src/rocksdb/db/logs_with_prep_tracker.cc | 67 + src/rocksdb/db/logs_with_prep_tracker.h | 63 + src/rocksdb/db/lookup_key.h | 66 + src/rocksdb/db/malloc_stats.cc | 54 + src/rocksdb/db/malloc_stats.h | 24 + src/rocksdb/db/manual_compaction_test.cc | 160 + src/rocksdb/db/memtable.cc | 1122 ++++ src/rocksdb/db/memtable.h | 542 ++ src/rocksdb/db/memtable_list.cc | 771 +++ src/rocksdb/db/memtable_list.h | 422 ++ src/rocksdb/db/memtable_list_test.cc | 922 +++ src/rocksdb/db/merge_context.h | 134 + src/rocksdb/db/merge_helper.cc | 417 ++ src/rocksdb/db/merge_helper.h | 194 + src/rocksdb/db/merge_helper_test.cc | 290 + src/rocksdb/db/merge_operator.cc | 86 + src/rocksdb/db/merge_test.cc | 504 ++ src/rocksdb/db/obsolete_files_test.cc | 222 + src/rocksdb/db/options_file_test.cc | 119 + src/rocksdb/db/perf_context_test.cc | 981 +++ src/rocksdb/db/pinned_iterators_manager.h | 87 + src/rocksdb/db/plain_table_db_test.cc | 1375 ++++ src/rocksdb/db/pre_release_callback.h | 38 + 
src/rocksdb/db/prefix_test.cc | 895 +++ src/rocksdb/db/range_del_aggregator.cc | 484 ++ src/rocksdb/db/range_del_aggregator.h | 441 ++ src/rocksdb/db/range_del_aggregator_bench.cc | 260 + src/rocksdb/db/range_del_aggregator_test.cc | 709 +++ src/rocksdb/db/range_tombstone_fragmenter.cc | 439 ++ src/rocksdb/db/range_tombstone_fragmenter.h | 256 + src/rocksdb/db/range_tombstone_fragmenter_test.cc | 552 ++ src/rocksdb/db/read_callback.h | 53 + src/rocksdb/db/repair.cc | 691 ++ src/rocksdb/db/repair_test.cc | 369 ++ src/rocksdb/db/snapshot_checker.h | 61 + src/rocksdb/db/snapshot_impl.cc | 26 + src/rocksdb/db/snapshot_impl.h | 167 + src/rocksdb/db/table_cache.cc | 668 ++ src/rocksdb/db/table_cache.h | 226 + src/rocksdb/db/table_properties_collector.cc | 74 + src/rocksdb/db/table_properties_collector.h | 107 + src/rocksdb/db/table_properties_collector_test.cc | 515 ++ src/rocksdb/db/transaction_log_impl.cc | 315 + src/rocksdb/db/transaction_log_impl.h | 127 + src/rocksdb/db/trim_history_scheduler.cc | 54 + src/rocksdb/db/trim_history_scheduler.h | 44 + src/rocksdb/db/version_builder.cc | 545 ++ src/rocksdb/db/version_builder.h | 48 + src/rocksdb/db/version_builder_test.cc | 349 ++ src/rocksdb/db/version_edit.cc | 826 +++ src/rocksdb/db/version_edit.h | 438 ++ src/rocksdb/db/version_edit_test.cc | 286 + src/rocksdb/db/version_set.cc | 6005 ++++++++++++++++++ src/rocksdb/db/version_set.h | 1251 ++++ src/rocksdb/db/version_set_test.cc | 1287 ++++ src/rocksdb/db/wal_manager.cc | 510 ++ src/rocksdb/db/wal_manager.h | 114 + src/rocksdb/db/wal_manager_test.cc | 338 + src/rocksdb/db/write_batch.cc | 2092 +++++++ src/rocksdb/db/write_batch_base.cc | 94 + src/rocksdb/db/write_batch_internal.h | 250 + src/rocksdb/db/write_batch_test.cc | 888 +++ src/rocksdb/db/write_callback.h | 27 + src/rocksdb/db/write_callback_test.cc | 452 ++ src/rocksdb/db/write_controller.cc | 128 + src/rocksdb/db/write_controller.h | 144 + src/rocksdb/db/write_controller_test.cc | 135 + src/rocksdb/db/write_thread.cc | 777 +++ src/rocksdb/db/write_thread.h | 431 ++ 195 files changed, 144976 insertions(+) create mode 100644 src/rocksdb/db/arena_wrapped_db_iter.cc create mode 100644 src/rocksdb/db/arena_wrapped_db_iter.h create mode 100644 src/rocksdb/db/blob_index.h create mode 100644 src/rocksdb/db/builder.cc create mode 100644 src/rocksdb/db/builder.h create mode 100644 src/rocksdb/db/c.cc create mode 100644 src/rocksdb/db/c_test.c create mode 100644 src/rocksdb/db/column_family.cc create mode 100644 src/rocksdb/db/column_family.h create mode 100644 src/rocksdb/db/column_family_test.cc create mode 100644 src/rocksdb/db/compact_files_test.cc create mode 100644 src/rocksdb/db/compacted_db_impl.cc create mode 100644 src/rocksdb/db/compacted_db_impl.h create mode 100644 src/rocksdb/db/compaction/compaction.cc create mode 100644 src/rocksdb/db/compaction/compaction.h create mode 100644 src/rocksdb/db/compaction/compaction_iteration_stats.h create mode 100644 src/rocksdb/db/compaction/compaction_iterator.cc create mode 100644 src/rocksdb/db/compaction/compaction_iterator.h create mode 100644 src/rocksdb/db/compaction/compaction_iterator_test.cc create mode 100644 src/rocksdb/db/compaction/compaction_job.cc create mode 100644 src/rocksdb/db/compaction/compaction_job.h create mode 100644 src/rocksdb/db/compaction/compaction_job_stats_test.cc create mode 100644 src/rocksdb/db/compaction/compaction_job_test.cc create mode 100644 src/rocksdb/db/compaction/compaction_picker.cc create mode 100644 src/rocksdb/db/compaction/compaction_picker.h 
create mode 100644 src/rocksdb/db/compaction/compaction_picker_fifo.cc create mode 100644 src/rocksdb/db/compaction/compaction_picker_fifo.h create mode 100644 src/rocksdb/db/compaction/compaction_picker_level.cc create mode 100644 src/rocksdb/db/compaction/compaction_picker_level.h create mode 100644 src/rocksdb/db/compaction/compaction_picker_test.cc create mode 100644 src/rocksdb/db/compaction/compaction_picker_universal.cc create mode 100644 src/rocksdb/db/compaction/compaction_picker_universal.h create mode 100644 src/rocksdb/db/comparator_db_test.cc create mode 100644 src/rocksdb/db/convenience.cc create mode 100644 src/rocksdb/db/corruption_test.cc create mode 100644 src/rocksdb/db/cuckoo_table_db_test.cc create mode 100644 src/rocksdb/db/db_basic_test.cc create mode 100644 src/rocksdb/db/db_blob_index_test.cc create mode 100644 src/rocksdb/db/db_block_cache_test.cc create mode 100644 src/rocksdb/db/db_bloom_filter_test.cc create mode 100644 src/rocksdb/db/db_compaction_filter_test.cc create mode 100644 src/rocksdb/db/db_compaction_test.cc create mode 100644 src/rocksdb/db/db_dynamic_level_test.cc create mode 100644 src/rocksdb/db/db_encryption_test.cc create mode 100644 src/rocksdb/db/db_filesnapshot.cc create mode 100644 src/rocksdb/db/db_flush_test.cc create mode 100644 src/rocksdb/db/db_impl/db_impl.cc create mode 100644 src/rocksdb/db/db_impl/db_impl.h create mode 100644 src/rocksdb/db/db_impl/db_impl_compaction_flush.cc create mode 100644 src/rocksdb/db/db_impl/db_impl_debug.cc create mode 100644 src/rocksdb/db/db_impl/db_impl_experimental.cc create mode 100644 src/rocksdb/db/db_impl/db_impl_files.cc create mode 100644 src/rocksdb/db/db_impl/db_impl_open.cc create mode 100644 src/rocksdb/db/db_impl/db_impl_readonly.cc create mode 100644 src/rocksdb/db/db_impl/db_impl_readonly.h create mode 100644 src/rocksdb/db/db_impl/db_impl_secondary.cc create mode 100644 src/rocksdb/db/db_impl/db_impl_secondary.h create mode 100644 src/rocksdb/db/db_impl/db_impl_write.cc create mode 100644 src/rocksdb/db/db_impl/db_secondary_test.cc create mode 100644 src/rocksdb/db/db_info_dumper.cc create mode 100644 src/rocksdb/db/db_info_dumper.h create mode 100644 src/rocksdb/db/db_inplace_update_test.cc create mode 100644 src/rocksdb/db/db_io_failure_test.cc create mode 100644 src/rocksdb/db/db_iter.cc create mode 100644 src/rocksdb/db/db_iter.h create mode 100644 src/rocksdb/db/db_iter_stress_test.cc create mode 100644 src/rocksdb/db/db_iter_test.cc create mode 100644 src/rocksdb/db/db_iterator_test.cc create mode 100644 src/rocksdb/db/db_log_iter_test.cc create mode 100644 src/rocksdb/db/db_memtable_test.cc create mode 100644 src/rocksdb/db/db_merge_operand_test.cc create mode 100644 src/rocksdb/db/db_merge_operator_test.cc create mode 100644 src/rocksdb/db/db_options_test.cc create mode 100644 src/rocksdb/db/db_properties_test.cc create mode 100644 src/rocksdb/db/db_range_del_test.cc create mode 100644 src/rocksdb/db/db_sst_test.cc create mode 100644 src/rocksdb/db/db_statistics_test.cc create mode 100644 src/rocksdb/db/db_table_properties_test.cc create mode 100644 src/rocksdb/db/db_tailing_iter_test.cc create mode 100644 src/rocksdb/db/db_test.cc create mode 100644 src/rocksdb/db/db_test2.cc create mode 100644 src/rocksdb/db/db_test_util.cc create mode 100644 src/rocksdb/db/db_test_util.h create mode 100644 src/rocksdb/db/db_universal_compaction_test.cc create mode 100644 src/rocksdb/db/db_wal_test.cc create mode 100644 src/rocksdb/db/db_write_test.cc create mode 100644 
src/rocksdb/db/dbformat.cc create mode 100644 src/rocksdb/db/dbformat.h create mode 100644 src/rocksdb/db/dbformat_test.cc create mode 100644 src/rocksdb/db/deletefile_test.cc create mode 100644 src/rocksdb/db/error_handler.cc create mode 100644 src/rocksdb/db/error_handler.h create mode 100644 src/rocksdb/db/error_handler_test.cc create mode 100644 src/rocksdb/db/event_helpers.cc create mode 100644 src/rocksdb/db/event_helpers.h create mode 100644 src/rocksdb/db/experimental.cc create mode 100644 src/rocksdb/db/external_sst_file_basic_test.cc create mode 100644 src/rocksdb/db/external_sst_file_ingestion_job.cc create mode 100644 src/rocksdb/db/external_sst_file_ingestion_job.h create mode 100644 src/rocksdb/db/external_sst_file_test.cc create mode 100644 src/rocksdb/db/fault_injection_test.cc create mode 100644 src/rocksdb/db/file_indexer.cc create mode 100644 src/rocksdb/db/file_indexer.h create mode 100644 src/rocksdb/db/file_indexer_test.cc create mode 100644 src/rocksdb/db/filename_test.cc create mode 100644 src/rocksdb/db/flush_job.cc create mode 100644 src/rocksdb/db/flush_job.h create mode 100644 src/rocksdb/db/flush_job_test.cc create mode 100644 src/rocksdb/db/flush_scheduler.cc create mode 100644 src/rocksdb/db/flush_scheduler.h create mode 100644 src/rocksdb/db/forward_iterator.cc create mode 100644 src/rocksdb/db/forward_iterator.h create mode 100644 src/rocksdb/db/forward_iterator_bench.cc create mode 100644 src/rocksdb/db/import_column_family_job.cc create mode 100644 src/rocksdb/db/import_column_family_job.h create mode 100644 src/rocksdb/db/import_column_family_test.cc create mode 100644 src/rocksdb/db/internal_stats.cc create mode 100644 src/rocksdb/db/internal_stats.h create mode 100644 src/rocksdb/db/job_context.h create mode 100644 src/rocksdb/db/listener_test.cc create mode 100644 src/rocksdb/db/log_format.h create mode 100644 src/rocksdb/db/log_reader.cc create mode 100644 src/rocksdb/db/log_reader.h create mode 100644 src/rocksdb/db/log_test.cc create mode 100644 src/rocksdb/db/log_writer.cc create mode 100644 src/rocksdb/db/log_writer.h create mode 100644 src/rocksdb/db/logs_with_prep_tracker.cc create mode 100644 src/rocksdb/db/logs_with_prep_tracker.h create mode 100644 src/rocksdb/db/lookup_key.h create mode 100644 src/rocksdb/db/malloc_stats.cc create mode 100644 src/rocksdb/db/malloc_stats.h create mode 100644 src/rocksdb/db/manual_compaction_test.cc create mode 100644 src/rocksdb/db/memtable.cc create mode 100644 src/rocksdb/db/memtable.h create mode 100644 src/rocksdb/db/memtable_list.cc create mode 100644 src/rocksdb/db/memtable_list.h create mode 100644 src/rocksdb/db/memtable_list_test.cc create mode 100644 src/rocksdb/db/merge_context.h create mode 100644 src/rocksdb/db/merge_helper.cc create mode 100644 src/rocksdb/db/merge_helper.h create mode 100644 src/rocksdb/db/merge_helper_test.cc create mode 100644 src/rocksdb/db/merge_operator.cc create mode 100644 src/rocksdb/db/merge_test.cc create mode 100644 src/rocksdb/db/obsolete_files_test.cc create mode 100644 src/rocksdb/db/options_file_test.cc create mode 100644 src/rocksdb/db/perf_context_test.cc create mode 100644 src/rocksdb/db/pinned_iterators_manager.h create mode 100644 src/rocksdb/db/plain_table_db_test.cc create mode 100644 src/rocksdb/db/pre_release_callback.h create mode 100644 src/rocksdb/db/prefix_test.cc create mode 100644 src/rocksdb/db/range_del_aggregator.cc create mode 100644 src/rocksdb/db/range_del_aggregator.h create mode 100644 src/rocksdb/db/range_del_aggregator_bench.cc create 
mode 100644 src/rocksdb/db/range_del_aggregator_test.cc create mode 100644 src/rocksdb/db/range_tombstone_fragmenter.cc create mode 100644 src/rocksdb/db/range_tombstone_fragmenter.h create mode 100644 src/rocksdb/db/range_tombstone_fragmenter_test.cc create mode 100644 src/rocksdb/db/read_callback.h create mode 100644 src/rocksdb/db/repair.cc create mode 100644 src/rocksdb/db/repair_test.cc create mode 100644 src/rocksdb/db/snapshot_checker.h create mode 100644 src/rocksdb/db/snapshot_impl.cc create mode 100644 src/rocksdb/db/snapshot_impl.h create mode 100644 src/rocksdb/db/table_cache.cc create mode 100644 src/rocksdb/db/table_cache.h create mode 100644 src/rocksdb/db/table_properties_collector.cc create mode 100644 src/rocksdb/db/table_properties_collector.h create mode 100644 src/rocksdb/db/table_properties_collector_test.cc create mode 100644 src/rocksdb/db/transaction_log_impl.cc create mode 100644 src/rocksdb/db/transaction_log_impl.h create mode 100644 src/rocksdb/db/trim_history_scheduler.cc create mode 100644 src/rocksdb/db/trim_history_scheduler.h create mode 100644 src/rocksdb/db/version_builder.cc create mode 100644 src/rocksdb/db/version_builder.h create mode 100644 src/rocksdb/db/version_builder_test.cc create mode 100644 src/rocksdb/db/version_edit.cc create mode 100644 src/rocksdb/db/version_edit.h create mode 100644 src/rocksdb/db/version_edit_test.cc create mode 100644 src/rocksdb/db/version_set.cc create mode 100644 src/rocksdb/db/version_set.h create mode 100644 src/rocksdb/db/version_set_test.cc create mode 100644 src/rocksdb/db/wal_manager.cc create mode 100644 src/rocksdb/db/wal_manager.h create mode 100644 src/rocksdb/db/wal_manager_test.cc create mode 100644 src/rocksdb/db/write_batch.cc create mode 100644 src/rocksdb/db/write_batch_base.cc create mode 100644 src/rocksdb/db/write_batch_internal.h create mode 100644 src/rocksdb/db/write_batch_test.cc create mode 100644 src/rocksdb/db/write_callback.h create mode 100644 src/rocksdb/db/write_callback_test.cc create mode 100644 src/rocksdb/db/write_controller.cc create mode 100644 src/rocksdb/db/write_controller.h create mode 100644 src/rocksdb/db/write_controller_test.cc create mode 100644 src/rocksdb/db/write_thread.cc create mode 100644 src/rocksdb/db/write_thread.h

diff --git a/src/rocksdb/db/arena_wrapped_db_iter.cc b/src/rocksdb/db/arena_wrapped_db_iter.cc
new file mode 100644
index 000000000..f43282a75
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+                                       std::string* prop) {
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    // First try to pass the value returned from inner iterator.
+    if (!db_iter_->GetProperty(prop_name, prop).ok()) {
+      *prop = ToString(sv_number_);
+    }
+    return Status::OK();
+  }
+  return db_iter_->GetProperty(prop_name, prop);
+}
+
+void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options,
+                              const ImmutableCFOptions& cf_options,
+                              const MutableCFOptions& mutable_cf_options,
+                              const SequenceNumber& sequence,
+                              uint64_t max_sequential_skip_in_iteration,
+                              uint64_t version_number,
+                              ReadCallback* read_callback, DBImpl* db_impl,
+                              ColumnFamilyData* cfd, bool allow_blob,
+                              bool allow_refresh) {
+  auto mem = arena_.AllocateAligned(sizeof(DBIter));
+  db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options,
+                              cf_options.user_comparator, nullptr, sequence,
+                              true, max_sequential_skip_in_iteration,
+                              read_callback, db_impl, cfd, allow_blob);
+  sv_number_ = version_number;
+  allow_refresh_ = allow_refresh;
+}
+
+Status ArenaWrappedDBIter::Refresh() {
+  if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
+    return Status::NotSupported("Creating renew iterator is not allowed.");
+  }
+  assert(db_iter_ != nullptr);
+  // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
+  // correct behavior. Will be corrected automatically when we take a snapshot
+  // here for the case of WritePreparedTxnDB.
+  SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+  uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
+  if (sv_number_ != cur_sv_number) {
+    Env* env = db_iter_->env();
+    db_iter_->~DBIter();
+    arena_.~Arena();
+    new (&arena_) Arena();
+
+    SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+    if (read_callback_) {
+      read_callback_->Refresh(latest_seq);
+    }
+    Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+         latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations,
+         cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_,
+         allow_refresh_);
+
+    InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+        read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(),
+        latest_seq);
+    SetIterUnderDBIter(internal_iter);
+  } else {
+    db_iter_->set_sequence(latest_seq);
+    db_iter_->set_valid(false);
+  }
+  return Status::OK();
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ReadOptions& read_options,
+    const ImmutableCFOptions& cf_options,
+    const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+    ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+    bool allow_blob, bool allow_refresh) {
+  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+  iter->Init(env, read_options, cf_options, mutable_cf_options, sequence,
+             max_sequential_skip_in_iterations, version_number, read_callback,
+             db_impl, cfd, allow_blob, allow_refresh);
+  if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
+    iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback,
+                           allow_blob);
+  }
+
+  return iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
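For context, the Refresh() path above is what backs the public Iterator::Refresh() API; a minimal sketch of exercising it from application code (illustrative only; assumes an already-open rocksdb::DB* db, error handling elided):

  rocksdb::ReadOptions ropts;
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ropts));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  // Re-seat the iterator on the latest SuperVersion / sequence number
  // instead of creating a brand-new iterator.
  rocksdb::Status s = it->Refresh();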
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.h b/src/rocksdb/db/arena_wrapped_db_iter.h
new file mode 100644
index 000000000..0c135f857
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+
+// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
+// iterator is supposed to be allocated. This class is used as an entry point
+// of an iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+// When using the class's Iterator interface, the behavior is exactly
+// the same as the inner DBIter.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+  virtual ~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
+
+  // Get the arena to be used to allocate memory for DBIter to be wrapped,
+  // as well as child iterators in it.
+  virtual Arena* GetArena() { return &arena_; }
+  virtual ReadRangeDelAggregator* GetRangeDelAggregator() {
+    return db_iter_->GetRangeDelAggregator();
+  }
+
+  // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+  // a merging iterator.
+  virtual void SetIterUnderDBIter(InternalIterator* iter) {
+    db_iter_->SetIter(iter);
+  }
+
+  bool Valid() const override { return db_iter_->Valid(); }
+  void SeekToFirst() override { db_iter_->SeekToFirst(); }
+  void SeekToLast() override { db_iter_->SeekToLast(); }
+  void Seek(const Slice& target) override { db_iter_->Seek(target); }
+  void SeekForPrev(const Slice& target) override {
+    db_iter_->SeekForPrev(target);
+  }
+  void Next() override { db_iter_->Next(); }
+  void Prev() override { db_iter_->Prev(); }
+  Slice key() const override { return db_iter_->key(); }
+  Slice value() const override { return db_iter_->value(); }
+  Status status() const override { return db_iter_->status(); }
+  bool IsBlob() const { return db_iter_->IsBlob(); }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override;
+
+  Status Refresh() override;
+
+  void Init(Env* env, const ReadOptions& read_options,
+            const ImmutableCFOptions& cf_options,
+            const MutableCFOptions& mutable_cf_options,
+            const SequenceNumber& sequence,
+            uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+            ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+            bool allow_blob, bool allow_refresh);
+
+  // Store some parameters so we can refresh the iterator at a later point
+  // with these same params
+  void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl,
+                        ColumnFamilyData* cfd, ReadCallback* read_callback,
+                        bool allow_blob) {
+    read_options_ = read_options;
+    db_impl_ = db_impl;
+    cfd_ = cfd;
+    read_callback_ = read_callback;
+    allow_blob_ = allow_blob;
+  }
+
+ private:
+  DBIter* db_iter_;
+  Arena arena_;
+  uint64_t sv_number_;
+  ColumnFamilyData* cfd_ = nullptr;
+  DBImpl* db_impl_ = nullptr;
+  ReadOptions read_options_;
+  ReadCallback* read_callback_;
+  bool allow_blob_ = false;
+  bool allow_refresh_ = true;
+};
+
+// Generate the arena wrapped iterator class.
+// `db_impl` and `cfd` are used for renewal. If left null, renewal will not
+// be supported.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ReadOptions& read_options,
+    const ImmutableCFOptions& cf_options,
+    const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+    ReadCallback* read_callback, DBImpl* db_impl = nullptr,
+    ColumnFamilyData* cfd = nullptr, bool allow_blob = false,
+    bool allow_refresh = true);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob_index.h b/src/rocksdb/db/blob_index.h
new file mode 100644
index 000000000..483a7b97b
--- /dev/null
+++ b/src/rocksdb/db/blob_index.h
@@ -0,0 +1,179 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <sstream>
+#include <string>
+
+#include "rocksdb/options.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// BlobIndex is a pointer to the blob and metadata of the blob. The index is
+// stored in base DB as ValueType::kTypeBlobIndex.
+// There are three types of blob index:
+//
+//    kInlinedTTL:
+//      +------+------------+---------------+
+//      | type | expiration | value         |
+//      +------+------------+---------------+
+//      | char | varint64   | variable size |
+//      +------+------------+---------------+
+//
+//    kBlob:
+//      +------+-------------+----------+----------+-------------+
+//      | type | file number | offset   | size     | compression |
+//      +------+-------------+----------+----------+-------------+
+//      | char | varint64    | varint64 | varint64 | char        |
+//      +------+-------------+----------+----------+-------------+
+//
+//    kBlobTTL:
+//      +------+------------+-------------+----------+----------+-------------+
+//      | type | expiration | file number | offset   | size     | compression |
+//      +------+------------+-------------+----------+----------+-------------+
+//      | char | varint64   | varint64    | varint64 | varint64 | char        |
+//      +------+------------+-------------+----------+----------+-------------+
+//
+// There isn't a kInlined (without TTL) type since we can store it as a plain
+// value (i.e. ValueType::kTypeValue).
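+//
+// Worked example (illustrative values, not from any real database): a kBlob
+// index for file number 7, offset 100, size 32, kNoCompression encodes to
+// the five bytes 0x01 0x07 0x64 0x20 0x00 -- one type byte, three single-byte
+// varint64s, and one compression byte.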
+class BlobIndex {
+ public:
+  enum class Type : unsigned char {
+    kInlinedTTL = 0,
+    kBlob = 1,
+    kBlobTTL = 2,
+    kUnknown = 3,
+  };
+
+  BlobIndex() : type_(Type::kUnknown) {}
+
+  bool IsInlined() const { return type_ == Type::kInlinedTTL; }
+
+  bool HasTTL() const {
+    return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
+  }
+
+  uint64_t expiration() const {
+    assert(HasTTL());
+    return expiration_;
+  }
+
+  const Slice& value() const {
+    assert(IsInlined());
+    return value_;
+  }
+
+  uint64_t file_number() const {
+    assert(!IsInlined());
+    return file_number_;
+  }
+
+  uint64_t offset() const {
+    assert(!IsInlined());
+    return offset_;
+  }
+
+  uint64_t size() const {
+    assert(!IsInlined());
+    return size_;
+  }
+
+  Status DecodeFrom(Slice slice) {
+    static const std::string kErrorMessage = "Error while decoding blob index";
+    assert(slice.size() > 0);
+    type_ = static_cast<Type>(*slice.data());
+    if (type_ >= Type::kUnknown) {
+      return Status::Corruption(
+          kErrorMessage,
+          "Unknown blob index type: " + ToString(static_cast<char>(type_)));
+    }
+    slice = Slice(slice.data() + 1, slice.size() - 1);
+    if (HasTTL()) {
+      if (!GetVarint64(&slice, &expiration_)) {
+        return Status::Corruption(kErrorMessage, "Corrupted expiration");
+      }
+    }
+    if (IsInlined()) {
+      value_ = slice;
+    } else {
+      if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
+          GetVarint64(&slice, &size_) && slice.size() == 1) {
+        compression_ = static_cast<CompressionType>(*slice.data());
+      } else {
+        return Status::Corruption(kErrorMessage, "Corrupted blob offset");
+      }
+    }
+    return Status::OK();
+  }
+
+  std::string DebugString(bool output_hex) const {
+    std::ostringstream oss;
+
+    if (IsInlined()) {
+      oss << "[inlined blob] value:" << value_.ToString(output_hex);
+    } else {
+      oss << "[blob ref] file:" << file_number_ << " offset:" << offset_
+          << " size:" << size_;
+    }
+
+    if (HasTTL()) {
+      oss << " exp:" << expiration_;
+    }
+
+    return oss.str();
+  }
+
+  static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
+                               const Slice& value) {
+    assert(dst != nullptr);
+    dst->clear();
+    dst->reserve(1 + kMaxVarint64Length + value.size());
+    dst->push_back(static_cast<char>(Type::kInlinedTTL));
+    PutVarint64(dst, expiration);
+    dst->append(value.data(), value.size());
+  }
+
+  static void EncodeBlob(std::string* dst, uint64_t file_number,
+                         uint64_t offset, uint64_t size,
+                         CompressionType compression) {
+    assert(dst != nullptr);
+    dst->clear();
+    dst->reserve(kMaxVarint64Length * 3 + 2);
+    dst->push_back(static_cast<char>(Type::kBlob));
+    PutVarint64(dst, file_number);
+    PutVarint64(dst, offset);
+    PutVarint64(dst, size);
+    dst->push_back(static_cast<char>(compression));
+  }
+
+  static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
+                            uint64_t file_number, uint64_t offset,
+                            uint64_t size, CompressionType compression) {
+    assert(dst != nullptr);
+    dst->clear();
+    dst->reserve(kMaxVarint64Length * 4 + 2);
+    dst->push_back(static_cast<char>(Type::kBlobTTL));
+    PutVarint64(dst, expiration);
+    PutVarint64(dst, file_number);
+    PutVarint64(dst, offset);
+    PutVarint64(dst, size);
+    dst->push_back(static_cast<char>(compression));
+  }
+
+ private:
+  Type type_ = Type::kUnknown;
+  uint64_t expiration_ = 0;
+  Slice value_;
+  uint64_t file_number_ = 0;
+  uint64_t offset_ = 0;
+  uint64_t size_ = 0;
+  CompressionType compression_ = kNoCompression;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
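A quick round-trip through the encoding above, as a hedged sketch (assumes this internal header is on the include path and a non-LITE build; values are arbitrary):

  #include "db/blob_index.h"
  using namespace ROCKSDB_NAMESPACE;

  std::string encoded;
  BlobIndex::EncodeBlob(&encoded, /*file_number=*/7, /*offset=*/100,
                        /*size=*/32, kNoCompression);
  BlobIndex idx;
  Status s = idx.DecodeFrom(encoded);  // parses type byte, then varints
  assert(s.ok() && !idx.IsInlined() && !idx.HasTTL());
  assert(idx.file_number() == 7 && idx.offset() == 100 && idx.size() == 32);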
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
new file mode 100644
index 000000000..fdb814cbb
--- /dev/null
+++ b/src/rocksdb/db/builder.cc
@@ -0,0 +1,263 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/compaction/compaction_iterator.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(
+    const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
+    const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    uint32_t column_family_id, const std::string& column_family_name,
+    WritableFileWriter* file, const CompressionType compression_type,
+    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
+    int level, const bool skip_filters, const uint64_t creation_time,
+    const uint64_t oldest_key_time, const uint64_t target_file_size,
+    const uint64_t file_creation_time) {
+  assert((column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+         column_family_name.empty());
+  return ioptions.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, internal_comparator,
+                          int_tbl_prop_collector_factories, compression_type,
+                          sample_for_compression, compression_opts,
+                          skip_filters, column_family_name, level,
+                          creation_time, oldest_key_time, target_file_size,
+                          file_creation_time),
+      column_family_id, file);
+}
+
+Status BuildTable(
+    const std::string& dbname, Env* env, FileSystem* fs,
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
+    TableCache* table_cache, InternalIterator* iter,
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters,
+    FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    uint32_t column_family_id, const std::string& column_family_name,
+    std::vector<SequenceNumber> snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SnapshotChecker* snapshot_checker, const CompressionType compression,
+    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
+    bool paranoid_file_checks, InternalStats* internal_stats,
+    TableFileCreationReason reason, EventLogger* event_logger, int job_id,
+    const Env::IOPriority io_priority, TableProperties* table_properties,
+    int level, const uint64_t creation_time, const uint64_t oldest_key_time,
+    Env::WriteLifeTimeHint write_hint, const uint64_t file_creation_time) {
+  assert((column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+         column_family_name.empty());
+  // Report IOStats for flush once for every kReportFlushIOStatsEvery bytes
+  // written.
+  const size_t kReportFlushIOStatsEvery = 1048576;
+  Status s;
+  meta->fd.file_size = 0;
+  iter->SeekToFirst();
+  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+      new CompactionRangeDelAggregator(&internal_comparator, snapshots));
+  for (auto& range_del_iter : range_del_iters) {
+    range_del_agg->AddTombstones(std::move(range_del_iter));
+  }
+
+  std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
+                                    meta->fd.GetPathId());
+#ifndef ROCKSDB_LITE
+  EventHelpers::NotifyTableFileCreationStarted(
+      ioptions.listeners, dbname, column_family_name, fname, job_id, reason);
+#endif  // !ROCKSDB_LITE
+  TableProperties tp;
+
+  if (iter->Valid() || !range_del_agg->IsEmpty()) {
+    TableBuilder* builder;
+    std::unique_ptr<WritableFileWriter> file_writer;
+    // Currently we only enable dictionary compression during compaction to the
+    // bottommost level.
+    CompressionOptions compression_opts_for_flush(compression_opts);
+    compression_opts_for_flush.max_dict_bytes = 0;
+    compression_opts_for_flush.zstd_max_train_bytes = 0;
+    {
+      std::unique_ptr<FSWritableFile> file;
+#ifndef NDEBUG
+      bool use_direct_writes = file_options.use_direct_writes;
+      TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
+#endif  // !NDEBUG
+      s = NewWritableFile(fs, fname, &file, file_options);
+      if (!s.ok()) {
+        EventHelpers::LogAndNotifyTableFileCreationFinished(
+            event_logger, ioptions.listeners, dbname, column_family_name,
+            fname, job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s);
+        return s;
+      }
+      file->SetIOPriority(io_priority);
+      file->SetWriteLifeTimeHint(write_hint);
+
+      file_writer.reset(new WritableFileWriter(
+          std::move(file), fname, file_options, env, ioptions.statistics,
+          ioptions.listeners, ioptions.sst_file_checksum_func));
+
+      builder = NewTableBuilder(
+          ioptions, mutable_cf_options, internal_comparator,
+          int_tbl_prop_collector_factories, column_family_id,
+          column_family_name, file_writer.get(), compression,
+          sample_for_compression, compression_opts_for_flush, level,
+          false /* skip_filters */, creation_time, oldest_key_time,
+          0 /*target_file_size*/, file_creation_time);
+    }
+
+    MergeHelper merge(env, internal_comparator.user_comparator(),
+                      ioptions.merge_operator, nullptr, ioptions.info_log,
+                      true /* internal key corruption is not ok */,
+                      snapshots.empty() ? 0 : snapshots.back(),
+                      snapshot_checker);
+
+    CompactionIterator c_iter(
+        iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,
+        &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env,
+        ShouldReportDetailedTime(env, ioptions.statistics),
+        true /* internal key corruption is not ok */, range_del_agg.get());
+    c_iter.SeekToFirst();
+    for (; c_iter.Valid(); c_iter.Next()) {
+      const Slice& key = c_iter.key();
+      const Slice& value = c_iter.value();
+      const ParsedInternalKey& ikey = c_iter.ikey();
+      builder->Add(key, value);
+      meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
+
+      // TODO(noetzli): Update stats after flush, too.
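+      // For flushes (io_priority == Env::IO_HIGH), publish the running
+      // bytes-written total to the thread status tracker once at least
+      // kReportFlushIOStatsEvery bytes have been written, so flush progress
+      // is externally observable.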
+      if (io_priority == Env::IO_HIGH &&
+          IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+        ThreadStatusUtil::SetThreadOperationProperty(
+            ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+      }
+    }
+
+    auto range_del_it = range_del_agg->NewIterator();
+    for (range_del_it->SeekToFirst(); range_del_it->Valid();
+         range_del_it->Next()) {
+      auto tombstone = range_del_it->Tombstone();
+      auto kv = tombstone.Serialize();
+      builder->Add(kv.first.Encode(), kv.second);
+      meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
+                                     tombstone.seq_, internal_comparator);
+    }
+
+    // Finish and check for builder errors
+    tp = builder->GetTableProperties();
+    bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
+    s = c_iter.status();
+    if (!s.ok() || empty) {
+      builder->Abandon();
+    } else {
+      s = builder->Finish();
+    }
+
+    if (s.ok() && !empty) {
+      uint64_t file_size = builder->FileSize();
+      meta->fd.file_size = file_size;
+      meta->marked_for_compaction = builder->NeedCompact();
+      assert(meta->fd.GetFileSize() > 0);
+      tp = builder->GetTableProperties();  // refresh now that builder is done
+      if (table_properties) {
+        *table_properties = tp;
+      }
+      // Add the checksum information to file metadata.
+      meta->file_checksum = builder->GetFileChecksum();
+      meta->file_checksum_func_name = builder->GetFileChecksumFuncName();
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    if (s.ok() && !empty) {
+      StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
+      s = file_writer->Sync(ioptions.use_fsync);
+    }
+    if (s.ok() && !empty) {
+      s = file_writer->Close();
+    }
+
+    if (s.ok() && !empty) {
+      // Verify that the table is usable. We set for_compaction to false and
+      // don't OptimizeForCompactionTableRead here because this is a special
+      // case after we finish building the table. No matter whether
+      // use_direct_io_for_flush_and_compaction is true, we will regard this
+      // verification as user reads, since the goal is to cache the table here
+      // for further user reads.
+      std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
+          ReadOptions(), file_options, internal_comparator, *meta,
+          nullptr /* range_del_agg */,
+          mutable_cf_options.prefix_extractor.get(), nullptr,
+          (internal_stats == nullptr) ? nullptr
+                                      : internal_stats->GetFileReadHist(0),
+          TableReaderCaller::kFlush, /*arena=*/nullptr,
+          /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key*/ nullptr));
+      s = it->status();
+      if (s.ok() && paranoid_file_checks) {
+        for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        }
+        s = it->status();
+      }
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (!s.ok() || meta->fd.GetFileSize() == 0) {
+    fs->DeleteFile(fname, IOOptions(), nullptr);
+  }
+
+  if (meta->fd.GetFileSize() == 0) {
+    fname = "(nil)";
+  }
+  // Output to event logger and fire events.
+  EventHelpers::LogAndNotifyTableFileCreationFinished(
+      event_logger, ioptions.listeners, dbname, column_family_name, fname,
+      job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s);
+
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
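In short, BuildTable() drives the TableBuilder it gets from NewTableBuilder() in the canonical add/finish-or-abandon pattern. A condensed sketch of that pattern, restating the code above (illustrative, error handling elided):

  TableBuilder* builder = NewTableBuilder(/* ... as above ... */);
  for (c_iter.SeekToFirst(); c_iter.Valid(); c_iter.Next()) {
    builder->Add(c_iter.key(), c_iter.value());  // keys arrive in sorted order
  }
  Status s = c_iter.status();
  if (s.ok() && builder->NumEntries() > 0) {
    s = builder->Finish();   // writes index, filter and footer blocks
  } else {
    builder->Abandon();      // discard the partial output file
  }
  delete builder;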
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
new file mode 100644
index 000000000..062f1fb80
--- /dev/null
+++ b/src/rocksdb/db/builder.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_properties_collector.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+#include "table/scoped_arena_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+struct EnvOptions;
+class Iterator;
+class SnapshotChecker;
+class TableCache;
+class VersionEdit;
+class TableBuilder;
+class WritableFileWriter;
+class InternalStats;
+
+// @param column_family_name Name of the column family that is also identified
+//    by column_family_id, or empty string if unknown. It must outlive the
+//    TableBuilder returned by this function.
+TableBuilder* NewTableBuilder(
+    const ImmutableCFOptions& options, const MutableCFOptions& moptions,
+    const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    uint32_t column_family_id, const std::string& column_family_name,
+    WritableFileWriter* file, const CompressionType compression_type,
+    const uint64_t sample_for_compression,
+    const CompressionOptions& compression_opts, int level,
+    const bool skip_filters = false, const uint64_t creation_time = 0,
+    const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0,
+    const uint64_t file_creation_time = 0);
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to the number specified in meta. On success, the
+// rest of *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+//
+// @param column_family_name Name of the column family that is also identified
+//    by column_family_id, or empty string if unknown.
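+//
+// BuildTable() is driven by the flush and recovery/repair paths, which use it
+// to turn an in-memory table's contents into an on-disk SST file.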
+extern Status BuildTable(
+    const std::string& dbname, Env* env, FileSystem* fs,
+    const ImmutableCFOptions& options,
+    const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
+    TableCache* table_cache, InternalIterator* iter,
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters,
+    FileMetaData* meta, const InternalKeyComparator& internal_comparator,
+    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories,
+    uint32_t column_family_id, const std::string& column_family_name,
+    std::vector<SequenceNumber> snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SnapshotChecker* snapshot_checker, const CompressionType compression,
+    const uint64_t sample_for_compression,
+    const CompressionOptions& compression_opts, bool paranoid_file_checks,
+    InternalStats* internal_stats, TableFileCreationReason reason,
+    EventLogger* event_logger = nullptr, int job_id = 0,
+    const Env::IOPriority io_priority = Env::IO_HIGH,
+    TableProperties* table_properties = nullptr, int level = -1,
+    const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0,
+    Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
+    const uint64_t file_creation_time = 0);
+
+}  // namespace ROCKSDB_NAMESPACE
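The next file, c.cc, implements the C API declared in rocksdb/c.h by wrapping each C++ object in a small struct. For orientation, a minimal sketch of that API from the caller's side (illustrative; "/tmp/testdb" is an arbitrary path and error handling is abbreviated):

  #include <stdlib.h>
  #include "rocksdb/c.h"

  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(opts, 1);
  char* err = NULL;
  rocksdb_t* db = rocksdb_open(opts, "/tmp/testdb", &err);

  rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
  rocksdb_put(db, wopts, "key", 3, "value", 5, &err);

  rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
  size_t len;
  char* val = rocksdb_get(db, ropts, "key", 3, &len, &err);  // malloc'd
  free(val);  // caller owns the returned buffer

  rocksdb_readoptions_destroy(ropts);
  rocksdb_writeoptions_destroy(wopts);
  rocksdb_close(db);
  rocksdb_options_destroy(opts);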
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
new file mode 100644
index 000000000..db78030df
--- /dev/null
+++ b/src/rocksdb/db/c.cc
@@ -0,0 +1,4451 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/c.h"
+
+#include <stdlib.h>
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/utilities/backupable_db.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/perf_context.h"
+#include "utilities/merge_operators.h"
+
+#include <vector>
+#include <unordered_set>
+#include <map>
+
+using ROCKSDB_NAMESPACE::BackupableDBOptions;
+using ROCKSDB_NAMESPACE::BackupEngine;
+using ROCKSDB_NAMESPACE::BackupID;
+using ROCKSDB_NAMESPACE::BackupInfo;
+using ROCKSDB_NAMESPACE::BatchResult;
+using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
+using ROCKSDB_NAMESPACE::BottommostLevelCompaction;
+using ROCKSDB_NAMESPACE::BytewiseComparator;
+using ROCKSDB_NAMESPACE::Cache;
+using ROCKSDB_NAMESPACE::Checkpoint;
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionFilterContext;
+using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::Comparator;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::CuckooTableOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::DbPath;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::FileLock;
+using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::LiveFileMetaData;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::MergeOperators;
+using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
+using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
+using ROCKSDB_NAMESPACE::NewLRUCache;
+using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::PerfContext;
+using ROCKSDB_NAMESPACE::PerfLevel;
+using ROCKSDB_NAMESPACE::PinnableSlice;
+using ROCKSDB_NAMESPACE::RandomAccessFile;
+using ROCKSDB_NAMESPACE::Range;
+using ROCKSDB_NAMESPACE::RateLimiter;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::RestoreOptions;
+using ROCKSDB_NAMESPACE::SequentialFile;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::SliceParts;
+using ROCKSDB_NAMESPACE::SliceTransform;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WALRecoveryMode;
+using ROCKSDB_NAMESPACE::WritableFile;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using std::shared_ptr;
+using std::vector;
+using std::unordered_set;
+using std::map;
+
+extern "C" {
+
+struct rocksdb_t { DB* rep; };
+struct rocksdb_backup_engine_t { BackupEngine* rep; };
+struct rocksdb_backup_engine_info_t { std::vector<BackupInfo> rep; };
+struct rocksdb_restore_options_t { RestoreOptions rep; };
+struct rocksdb_iterator_t { Iterator* rep; };
+struct rocksdb_writebatch_t { WriteBatch rep; };
+struct rocksdb_writebatch_wi_t { WriteBatchWithIndex* rep; };
+struct rocksdb_snapshot_t { const Snapshot* rep; };
+struct rocksdb_flushoptions_t { FlushOptions rep; };
+struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
+struct rocksdb_readoptions_t {
+  ReadOptions rep;
+  // stack variables to set pointers to in ReadOptions
+  Slice upper_bound;
+  Slice lower_bound;
+};
+struct rocksdb_writeoptions_t { WriteOptions rep; };
+struct rocksdb_options_t { Options rep; };
+struct rocksdb_compactoptions_t {
+  CompactRangeOptions rep;
+};
+struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; };
+struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; };
+struct rocksdb_seqfile_t { SequentialFile* rep; };
+struct rocksdb_randomfile_t { RandomAccessFile* rep; };
+struct rocksdb_writablefile_t { WritableFile* rep; };
+struct rocksdb_wal_iterator_t { TransactionLogIterator* rep; };
+struct rocksdb_wal_readoptions_t { TransactionLogIterator::ReadOptions rep; };
+struct rocksdb_filelock_t { FileLock* rep; };
+struct rocksdb_logger_t {
+  std::shared_ptr<Logger> rep;
+};
+struct rocksdb_cache_t {
+  std::shared_ptr<Cache> rep;
+};
+struct rocksdb_livefiles_t { std::vector<LiveFileMetaData> rep; };
+struct rocksdb_column_family_handle_t { ColumnFamilyHandle* rep; };
+struct rocksdb_envoptions_t { EnvOptions rep; };
+struct rocksdb_ingestexternalfileoptions_t { IngestExternalFileOptions rep; };
+struct rocksdb_sstfilewriter_t { SstFileWriter* rep; };
+struct rocksdb_ratelimiter_t {
+  std::shared_ptr<RateLimiter> rep;
+};
+struct rocksdb_perfcontext_t { PerfContext* rep; };
+struct rocksdb_pinnableslice_t {
+  PinnableSlice rep;
+};
+struct rocksdb_transactiondb_options_t {
+  TransactionDBOptions rep;
+};
+struct rocksdb_transactiondb_t {
+  TransactionDB* rep;
+};
+struct rocksdb_transaction_options_t {
+  TransactionOptions rep;
+};
+struct rocksdb_transaction_t {
+  Transaction* rep;
+};
+struct rocksdb_checkpoint_t {
+  Checkpoint* rep;
+};
+struct rocksdb_optimistictransactiondb_t {
+  OptimisticTransactionDB* rep;
+};
+struct rocksdb_optimistictransaction_options_t {
+  OptimisticTransactionOptions rep;
+};
+
+struct rocksdb_compactionfiltercontext_t {
+  CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+  void* state_;
+  void (*destructor_)(void*);
+  unsigned char (*filter_)(
+      void*,
+      int level,
+      const char* key, size_t key_length,
+      const char* existing_value, size_t value_length,
+      char** new_value, size_t* new_value_length,
+      unsigned char* value_changed);
+  const char* (*name_)(void*);
+  unsigned char ignore_snapshots_;
+
+  ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
+
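+  // Bridges the C++ CompactionFilter interface to the user-supplied C
+  // callbacks; any replacement value produced by the callback is copied
+  // into *new_value.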
+  bool Filter(int level, const Slice& key, const Slice& existing_value,
+              std::string* new_value, bool* value_changed) const override {
+    char* c_new_value = nullptr;
+    size_t new_value_length = 0;
+    unsigned char c_value_changed = 0;
+    unsigned char result = (*filter_)(
+        state_,
+        level,
+        key.data(), key.size(),
+        existing_value.data(), existing_value.size(),
+        &c_new_value, &new_value_length, &c_value_changed);
+    if (c_value_changed) {
+      new_value->assign(c_new_value, new_value_length);
+      *value_changed = true;
+    }
+    return result;
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  bool IgnoreSnapshots() const override { return ignore_snapshots_; }
+};
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+  void* state_;
+  void (*destructor_)(void*);
+  rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+      void*, rocksdb_compactionfiltercontext_t* context);
+  const char* (*name_)(void*);
+
+  ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    rocksdb_compactionfiltercontext_t ccontext;
+    ccontext.rep = context;
+    CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+    return std::unique_ptr<CompactionFilter>(cf);
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(
+      void*,
+      const char* a, size_t alen,
+      const char* b, size_t blen);
+  const char* (*name_)(void*);
+
+  ~rocksdb_comparator_t() override { (*destructor_)(state_); }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  // No-ops since the C binding does not support key shortening methods.
+  void FindShortestSeparator(std::string*, const Slice&) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*create_)(
+      void*,
+      const char* const* key_array, const size_t* key_length_array,
+      int num_keys,
+      size_t* filter_length);
+  unsigned char (*key_match_)(
+      void*,
+      const char* key, size_t length,
+      const char* filter, size_t filter_length);
+  void (*delete_filter_)(
+      void*,
+      const char* filter, size_t filter_length);
+
+  ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+    std::vector<const char*> key_pointers(n);
+    std::vector<size_t> key_sizes(n);
+    for (int i = 0; i < n; i++) {
+      key_pointers[i] = keys[i].data();
+      key_sizes[i] = keys[i].size();
+    }
+    size_t len;
+    char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+    dst->append(filter, len);
+
+    if (delete_filter_ != nullptr) {
+      (*delete_filter_)(state_, filter, len);
+    } else {
+      free(filter);
+    }
+  }
+
+  bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+    return (*key_match_)(state_, key.data(), key.size(),
+                         filter.data(), filter.size());
+  }
+};
+
+struct rocksdb_mergeoperator_t : public MergeOperator {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*full_merge_)(
+      void*,
+      const char* key, size_t key_length,
+      const char* existing_value, size_t existing_value_length,
+      const char* const* operands_list, const size_t* operands_list_length,
+      int num_operands,
+      unsigned char* success, size_t* new_value_length);
+  char* (*partial_merge_)(void*, const char* key, size_t key_length,
+                          const char* const* operands_list,
+                          const size_t* operands_list_length, int num_operands,
+                          unsigned char* success, size_t* new_value_length);
+  void (*delete_value_)(
+      void*,
+      const char* value, size_t value_length);
+
+  ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    size_t n = merge_in.operand_list.size();
+    std::vector<const char*> operand_pointers(n);
+    std::vector<size_t> operand_sizes(n);
+    for (size_t i = 0; i < n; i++) {
+      Slice operand(merge_in.operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    const char* existing_value_data = nullptr;
+    size_t existing_value_len = 0;
+    if (merge_in.existing_value != nullptr) {
+      existing_value_data = merge_in.existing_value->data();
+      existing_value_len = merge_in.existing_value->size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*full_merge_)(
+        state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
+        existing_value_len, &operand_pointers[0], &operand_sizes[0],
+        static_cast<int>(n), &success, &new_value_len);
+    merge_out->new_value.assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
+      (*delete_value_)(state_, tmp_new_value, new_value_len);
+    } else {
+      free(tmp_new_value);
+    }
+
+    return success;
+  }
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* /*logger*/) const override {
+    size_t operand_count = operand_list.size();
+    std::vector<const char*> operand_pointers(operand_count);
+    std::vector<size_t> operand_sizes(operand_count);
+    for (size_t i = 0; i < operand_count; ++i) {
+      Slice operand(operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*partial_merge_)(
+        state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+        static_cast<int>(operand_count), &success, &new_value_len);
+    new_value->assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
+      (*delete_value_)(state_, tmp_new_value, new_value_len);
+    } else {
+      free(tmp_new_value);
+    }
+
+    return success;
+  }
+};
+
+struct rocksdb_dbpath_t {
+  DbPath rep;
+};
+
+struct rocksdb_env_t {
+  Env* rep;
+  bool is_default;
+};
+
+struct rocksdb_slicetransform_t : public SliceTransform {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*transform_)(
+      void*,
+      const char* key, size_t length,
+      size_t* dst_length);
+  unsigned char (*in_domain_)(
+      void*,
+      const char* key, size_t length);
+  unsigned char (*in_range_)(
+      void*,
+      const char* key, size_t length);
+
+  ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  Slice Transform(const Slice& src) const override {
+    size_t len;
+    char* dst = (*transform_)(state_, src.data(), src.size(), &len);
+    return Slice(dst, len);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    return (*in_domain_)(state_, src.data(), src.size());
+  }
+
+  bool InRange(const Slice& src) const override {
+    return (*in_range_)(state_, src.data(), src.size());
+  }
+};
+
+struct rocksdb_universal_compaction_options_t {
+  ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep;
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != nullptr);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == nullptr) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    // This is a bug if *errptr is not created by malloc()
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+
+static char* CopyString(const std::string& str) {
+  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+  memcpy(result, str.data(), sizeof(char) * str.size());
+  return result;
+}
+
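+// Illustrative sketch of the error convention these helpers implement (caller
+// side, not part of this file): every fallible C API function takes a final
+// char** errptr that must point to a NULL char*; on failure it receives a
+// malloc()'d message the caller frees.
+//
+//   char* err = NULL;
+//   rocksdb_t* db = rocksdb_open(opts, "/tmp/example_db", &err);
+//   if (err != NULL) {
+//     fprintf(stderr, "open failed: %s\n", err);
+//     free(err);
+//   }
+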
+rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_with_ttl(
+    const rocksdb_options_t* options,
+    const char* name,
+    int ttl,
+    char** errptr) {
+  ROCKSDB_NAMESPACE::DBWithTTL* db;
+  if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+                            options->rep, std::string(name), &db, ttl))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only(
+    const rocksdb_options_t* options,
+    const char* name,
+    unsigned char error_if_log_file_exist,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), &db, error_if_log_file_exist))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
+                                     const char* name,
+                                     const char* secondary_path,
+                                     char** errptr) {
+  DB* db;
+  if (SaveError(errptr,
+                DB::OpenAsSecondary(options->rep, std::string(name),
+                                    std::string(secondary_path), &db))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+    const rocksdb_options_t* options, const char* path, char** errptr) {
+  BackupEngine* be;
+  if (SaveError(errptr, BackupEngine::Open(options->rep.env,
+                                           BackupableDBOptions(path,
+                                                               nullptr,
+                                                               true,
+                                                               options->rep.info_log.get()),
+                                           &be))) {
+    return nullptr;
+  }
+  rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+  result->rep = be;
+  return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+                                             rocksdb_t* db,
+                                             char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+void rocksdb_backup_engine_create_new_backup_flush(rocksdb_backup_engine_t* be,
+                                                   rocksdb_t* db,
+                                                   unsigned char flush_before_backup,
+                                                   char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
+}
+
+void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
+                                             uint32_t num_backups_to_keep,
+                                             char** errptr) {
+  SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+  return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+                                                int v) {
+  opt->rep.keep_log_files = v;
+}
+
+
+void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+                                         uint32_t backup_id, char** errptr) {
+  SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, char** errptr) {
+  SaveError(errptr,
+            be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+                                               std::string(wal_dir),
+                                               restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+    rocksdb_backup_engine_t* be) {
+  rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+  be->rep->GetBackupInfo(&result->rep);
+  return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+  return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t* info) {
+  delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+  delete be->rep;
+  delete be;
+}
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+                                                       char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+                               const char* checkpoint_dir,
+                               uint64_t log_size_for_flush, char** errptr) {
+  SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+                        std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+  delete checkpoint->rep;
+  delete checkpoint;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+  opt->rep.merge_operator =
+      ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::Open(DBOptions(db_options->rep),
+                                 std::string(name), column_families, &handles, &db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
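+// Illustrative sketch of opening with column families (caller side, not part
+// of this file); the "default" family must always be listed, and "aux" is a
+// hypothetical second family:
+//
+//   const char* const cf_names[2] = {"default", "aux"};
+//   const rocksdb_options_t* const cf_opts[2] = {opts, opts};
+//   rocksdb_column_family_handle_t* cf_handles[2];
+//   char* err = NULL;
+//   rocksdb_t* db = rocksdb_open_column_families(
+//       opts, "/tmp/example_db", 2, cf_names, cf_opts, cf_handles, &err);
+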
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    unsigned char error_if_log_file_exist, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep),
+                                            std::string(name), column_families, &handles, &db, error_if_log_file_exist))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    const char* secondary_path, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i != num_column_families; ++i) {
+    column_families.emplace_back(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep));
+  }
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+                                            std::string(name),
+                                            std::string(secondary_path),
+                                            column_families, &handles, &db))) {
+    return nullptr;
+  }
+  for (size_t i = 0; i != handles.size(); ++i) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+char** rocksdb_list_column_families(
+    const rocksdb_options_t* options,
+    const char* name,
+    size_t* lencfs,
+    char** errptr) {
+  std::vector<std::string> fams;
+  SaveError(errptr,
+            DB::ListColumnFamilies(DBOptions(options->rep),
+                                   std::string(name), &fams));
+
+  *lencfs = fams.size();
+  char** column_families = static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+  for (size_t i = 0; i < fams.size(); i++) {
+    column_families[i] = strdup(fams[i].c_str());
+  }
+  return column_families;
+}
+
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+  for (size_t i = 0; i < len; ++i) {
+    free(list[i]);
+  }
+  free(list);
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+    rocksdb_t* db,
+    const rocksdb_options_t* column_family_options,
+    const char* column_family_name,
+    char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr,
+            db->rep->CreateColumnFamily(ColumnFamilyOptions(column_family_options->rep),
+                                        std::string(column_family_name), &(handle->rep)));
+  return handle;
+}
+
+void rocksdb_drop_column_family(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* handle,
+    char** errptr) {
+  SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t* handle) {
+  delete handle->rep;
+  delete handle;
+}
+
+void rocksdb_put(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, column_family->rep,
+                         Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_delete_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+                                    Slice(key, keylen)));
+}
+
+void rocksdb_delete_range_cf(rocksdb_t* db,
+                             const rocksdb_writeoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* start_key, size_t start_key_len,
+                             const char* end_key, size_t end_key_len,
+                             char** errptr) {
+  SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
+                                         Slice(start_key, start_key_len),
+                                         Slice(end_key, end_key_len)));
+}
+
+void rocksdb_merge(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Merge(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_merge_cf(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Merge(options->rep, column_family->rep,
+                           Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_write(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, column_family->rep,
+                          Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+void rocksdb_multi_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes,
+    char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
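+// Illustrative sketch (caller side, not part of this file): values returned
+// by rocksdb_get()/rocksdb_multi_get() are malloc()'d and not NUL-terminated,
+// so the returned length must be used and the buffer freed; a missing key
+// returns NULL with no error set:
+//
+//   size_t vlen = 0;
+//   char* err = NULL;
+//   char* val = rocksdb_get(db, ropts, "key", 3, &vlen, &err);
+//   if (val != NULL) {
+//     /* use val[0..vlen) */
+//     free(val);
+//   }
+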
+void rocksdb_multi_get_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes,
+    char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses = db->rep->MultiGet(options->rep, cfs, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+    rocksdb_t* db, uint64_t seq_number,
+    const rocksdb_wal_readoptions_t* options,
+    char** errptr) {
+  std::unique_ptr<TransactionLogIterator> iter;
+  TransactionLogIterator::ReadOptions ro;
+  if (options != nullptr) {
+    ro = options->rep;
+  }
+  if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
+    return nullptr;
+  }
+  rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
+  result->rep = iter.release();
+  return result;
+}
+
+void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) {
+  iter->rep->Next();
+}
+
+unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
+  rocksdb_writebatch_t* result = rocksdb_writebatch_create();
+  BatchResult wal_batch = iter->rep->GetBatch();
+  result->rep = std::move(*wal_batch.writeBatchPtr);
+  if (seq != nullptr) {
+    *seq = wal_batch.sequence;
+  }
+  return result;
+}
+
+uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) {
+  return db->rep->GetLatestSequenceNumber();
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+  return result;
+}
+
+void rocksdb_create_iterators(
+    rocksdb_t* db,
+    rocksdb_readoptions_t* opts,
+    rocksdb_column_family_handle_t** column_families,
+    rocksdb_iterator_t** iterators,
+    size_t size,
+    char** errptr) {
+  std::vector<ColumnFamilyHandle*> column_families_vec;
+  for (size_t i = 0; i < size; i++) {
+    column_families_vec.push_back(column_families[i]->rep);
+  }
+
+  std::vector<Iterator*> res;
+  Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
+  assert(res.size() == size);
+  if (SaveError(errptr, status)) {
+    return;
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    iterators[i] = new rocksdb_iterator_t;
+    iterators[i]->rep = res[i];
+  }
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+void rocksdb_release_snapshot(
+    rocksdb_t* db,
+    const rocksdb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* rocksdb_property_value(
+    rocksdb_t* db,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+int rocksdb_property_int(
+    rocksdb_t* db,
+    const char* propname,
+    uint64_t* out_val) {
+  if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+int rocksdb_property_int_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* propname,
+    uint64_t* out_val) {
+  if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+char* rocksdb_property_value_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+void rocksdb_approximate_sizes(
+    rocksdb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
+void rocksdb_approximate_sizes_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
+void rocksdb_delete_file(
+    rocksdb_t* db,
+    const char* name) {
+  db->rep->DeleteFile(name);
+}
+
+const rocksdb_livefiles_t* rocksdb_livefiles(
+    rocksdb_t* db) {
+  rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+  db->rep->GetLiveFilesMetaData(&result->rep);
+  return result;
+}
+
+void rocksdb_compact_range(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      CompactRangeOptions(),
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf(
+    rocksdb_t* db,
+    rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      CompactRangeOptions(), column_family->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
+                               const char* start_key, size_t start_key_len,
+                               const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      opt->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf_opt(rocksdb_t* db,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  rocksdb_compactoptions_t* opt,
+                                  const char* start_key, size_t start_key_len,
+                                  const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      opt->rep, column_family->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_flush(
+    rocksdb_t* db,
+    const rocksdb_flushoptions_t* options,
+    char** errptr) {
+  SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_flush_cf(
+    rocksdb_t* db,
+    const rocksdb_flushoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    char** errptr) {
+  SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
+}
+
+void rocksdb_disable_file_deletions(
+    rocksdb_t* db,
+    char** errptr) {
+  SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(
+    rocksdb_t* db,
+    unsigned char force,
+    char** errptr) {
+  SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(
+    const rocksdb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
+                                size_t klen) {
+  iter->rep->SeekForPrev(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) {
+  iter->rep->Next();
+}
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) {
+  iter->rep->Prev();
+}
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
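+// Illustrative scan loop over the iterator API above (caller side, not part
+// of this file); key()/value() return pointers that stay valid only until
+// the iterator is moved or destroyed:
+//
+//   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ropts);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen, vlen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     const char* v = rocksdb_iter_value(it, &vlen);
+//     /* use k[0..klen), v[0..vlen) */
+//   }
+//   rocksdb_iter_destroy(it);
+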
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+  return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+                                                     size_t size) {
+  rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+  b->rep = WriteBatch(std::string(rep, size));
+  return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) {
+  delete b;
+}
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) {
+  b->rep.Clear();
+}
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) {
+  return b->rep.Count();
+}
+
+void rocksdb_writebatch_put(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(
+    rocksdb_writebatch_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_mergev(
+    rocksdb_writebatch_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
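+// Illustrative batched write using the functions above (caller side, not
+// part of this file); the batch is applied atomically by rocksdb_write():
+//
+//   rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+//   rocksdb_writebatch_put(wb, "k1", 2, "v1", 2);
+//   rocksdb_writebatch_delete(wb, "k2", 2);
+//   char* err = NULL;
+//   rocksdb_write(db, wopts, wb, &err);
+//   rocksdb_writebatch_destroy(wb);
+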
+void rocksdb_writebatch_mergev_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_delete(
+    rocksdb_writebatch_t* b,
+    const char* key, size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_deletev(
+    rocksdb_writebatch_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+                                     const char* start_key,
+                                     size_t start_key_len, const char* end_key,
+                                     size_t end_key_len) {
+  b->rep.DeleteRange(Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+                                      const char* const* start_keys_list,
+                                      const size_t* start_keys_list_sizes,
+                                      const char* const* end_keys_list,
+                                      const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(column_family->rep,
+                     SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(
+    rocksdb_writebatch_t* b,
+    const char* blob, size_t len) {
+  b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+  void* state_;
+  void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+  void (*deleted_)(void*, const char* k, size_t klen);
+  void Put(const Slice& key, const Slice& value) override {
+    (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+  }
+  void Delete(const Slice& key) override {
+    (*deleted_)(state_, key.data(), key.size());
+  }
+};
+
+void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+  *size = b->rep.GetDataSize();
+  return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+  b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+                                               char** errptr) {
+  SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+  SaveError(errptr, b->rep.PopSavePoint());
+}
+
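+// Illustrative use of rocksdb_writebatch_iterate() above (caller side, not
+// part of this file); the callback names are hypothetical:
+//
+//   static void on_put(void* state, const char* k, size_t klen,
+//                      const char* v, size_t vlen) { /* ... */ }
+//   static void on_delete(void* state, const char* k, size_t klen) {
+//     /* ... */
+//   }
+//
+//   rocksdb_writebatch_iterate(wb, NULL, on_put, on_delete);
+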
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(size_t reserved_bytes, unsigned char overwrite_key) {
+  rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+  b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes, overwrite_key);
+  return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+  if (b->rep) {
+    delete b->rep;
+  }
+  delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+  b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+  return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(
+    rocksdb_writebatch_wi_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(
+    rocksdb_writebatch_wi_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_putv(
+    rocksdb_writebatch_wi_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+    rocksdb_writebatch_wi_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(
+    rocksdb_writebatch_wi_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+    rocksdb_writebatch_wi_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(
+    rocksdb_writebatch_wi_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+    rocksdb_writebatch_wi_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(
+    rocksdb_writebatch_wi_t* b,
+    const char* key, size_t klen) {
+  b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+    rocksdb_writebatch_wi_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(
+    rocksdb_writebatch_wi_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+    rocksdb_writebatch_wi_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+                                        const char* start_key,
+                                        size_t start_key_len, const char* end_key,
+                                        size_t end_key_len) {
+  b->rep->DeleteRange(Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, int num_keys,
+                                         const char* const* start_keys_list,
+                                         const size_t* start_keys_list_sizes,
+                                         const char* const* end_keys_list,
+                                         const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(column_family->rep,
+                      SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(
+    rocksdb_writebatch_wi_t* b,
+    const char* blob, size_t len) {
+  b->rep->PutLogData(Slice(blob, len));
+}
+
+void rocksdb_writebatch_wi_iterate(
+    rocksdb_writebatch_wi_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b, size_t* size) {
+  WriteBatch* wb = b->rep->GetWriteBatch();
+  *size = wb->GetDataSize();
+  return wb->Data().c_str();
+}
+
+void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
+  b->rep->SetSavePoint();
+}
+
+void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
+                                                  char** errptr) {
+  SaveError(errptr, b->rep->RollbackToSavePoint());
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
+    rocksdb_writebatch_wi_t* wbwi,
+    rocksdb_iterator_t* base_iterator) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
+  delete base_iterator;
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep =
+      wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
+  delete base_iterator;
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch(
+    rocksdb_writebatch_wi_t* wbwi,
+    const rocksdb_options_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_cf(
+    rocksdb_writebatch_wi_t* wbwi,
+    const rocksdb_options_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
+                                     Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi,
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi,
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep,
+                                          Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+void rocksdb_write_writebatch_wi(
+    rocksdb_t* db,
+    const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_wi_t* wbwi,
+    char** errptr) {
+  WriteBatch* wb = wbwi->rep->GetWriteBatch();
+  SaveError(errptr, db->rep->Write(options->rep, wb));
+}
+
+rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create() {
+  return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size) {
+  options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+  options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+  options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_index_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int index_block_restart_interval) {
+  options->rep.index_block_restart_interval = index_block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_metadata_block_size(
+    rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size) {
+  options->rep.metadata_block_size = metadata_block_size;
+}
+
+void rocksdb_block_based_options_set_partition_filters(
+    rocksdb_block_based_table_options_t* options, unsigned char partition_filters) {
+  options->rep.partition_filters = partition_filters;
+}
+
+void rocksdb_block_based_options_set_use_delta_encoding(
+    rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding) {
+  options->rep.use_delta_encoding = use_delta_encoding;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy) {
+  options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char no_block_cache) {
+  options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache) {
+  if (block_cache) {
+    options->rep.block_cache = block_cache->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache_compressed) {
+  if (block_cache_compressed) {
+    options->rep.block_cache_compressed = block_cache_compressed->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_block_based_options_set_format_version(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_index_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.data_block_index_type =
+      static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_hash_ratio(
+    rocksdb_block_based_table_options_t* options, double v) {
+  options->rep.data_block_hash_table_util_ratio = v;
+}
+
+void rocksdb_block_based_options_set_hash_index_allow_collision(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.hash_index_allow_collision = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.cache_index_and_filter_blocks = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.cache_index_and_filter_blocks_with_high_priority = v;
+}
+
+void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
+}
+
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.pin_top_level_index_and_filter = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t* opt,
+    rocksdb_block_based_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
+  }
+}
+
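+// Illustrative table-format configuration using the setters above (caller
+// side, not part of this file); rocksdb_cache_create_lru() is declared in
+// c.h:
+//
+//   rocksdb_block_based_table_options_t* bbto =
+//       rocksdb_block_based_options_create();
+//   rocksdb_block_based_options_set_block_size(bbto, 16 * 1024);
+//   rocksdb_block_based_options_set_block_cache(
+//       bbto, rocksdb_cache_create_lru(64 * 1024 * 1024));
+//   rocksdb_options_set_block_based_table_factory(opts, bbto);
+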
+rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create() {
+  return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(
+    rocksdb_cuckoo_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+    rocksdb_cuckoo_table_options_t* options, double v) {
+  options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t* opt,
+    rocksdb_cuckoo_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
+  }
+}
+
+void rocksdb_set_options(
+    rocksdb_t* db, int count, const char* const keys[], const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) {
+    options_map[keys[i]] = values[i];
+  }
+  SaveError(errptr, db->rep->SetOptions(options_map));
+}
+
+void rocksdb_set_options_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) {
+    options_map[keys[i]] = values[i];
+  }
+  SaveError(errptr, db->rep->SetOptions(handle->rep, options_map));
+}
+
+rocksdb_options_t* rocksdb_options_create() {
+  return new rocksdb_options_t;
+}
+
+void rocksdb_options_destroy(rocksdb_options_t* options) {
+  delete options;
+}
+
+void rocksdb_options_increase_parallelism(
+    rocksdb_options_t* opt, int total_threads) {
+  opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(
+    rocksdb_options_t* opt, uint64_t block_cache_size_mb) {
+  opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_allow_ingest_behind(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.allow_ingest_behind = v;
+}
+
+void rocksdb_options_set_compaction_filter(
+    rocksdb_options_t* opt,
+    rocksdb_compactionfilter_t* filter) {
+  opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+    rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+  opt->rep.compaction_filter_factory =
+      std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
+void rocksdb_options_compaction_readahead_size(
+    rocksdb_options_t* opt, size_t s) {
+  opt->rep.compaction_readahead_size = s;
+}
+
+void rocksdb_options_set_comparator(
+    rocksdb_options_t* opt,
+    rocksdb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+    rocksdb_options_t* opt,
+    rocksdb_mergeoperator_t* merge_operator) {
+  opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+void rocksdb_options_set_create_missing_column_families(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.create_missing_column_families = v;
+}
+
+void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
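+// Illustrative dynamic option change via rocksdb_set_options() above (caller
+// side, not part of this file):
+//
+//   const char* const keys[] = {"write_buffer_size"};
+//   const char* const vals[] = {"33554432"};
+//   char* err = NULL;
+//   rocksdb_set_options(db, 1, keys, vals, &err);
+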
+void rocksdb_options_set_error_if_exists( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.error_if_exists = v; +} + +void rocksdb_options_set_paranoid_checks( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.paranoid_checks = v; +} + +void rocksdb_options_set_db_paths(rocksdb_options_t* opt, + const rocksdb_dbpath_t** dbpath_values, + size_t num_paths) { + std::vector<DbPath> db_paths(num_paths); + for (size_t i = 0; i < num_paths; ++i) { + db_paths[i] = dbpath_values[i]->rep; + } + opt->rep.db_paths = db_paths; +} + +void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { + opt->rep.env = (env ? env->rep : nullptr); +} + +void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { + if (l) { + opt->rep.info_log = l->rep; + } +} + +void rocksdb_options_set_info_log_level( + rocksdb_options_t* opt, int v) { + opt->rep.info_log_level = static_cast<InfoLogLevel>(v); +} + +void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt, + size_t s) { + opt->rep.db_write_buffer_size = s; +} + +void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { + opt->rep.write_buffer_size = s; +} + +void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { + opt->rep.max_open_files = n; +} + +void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, int n) { + opt->rep.max_file_opening_threads = n; +} + +void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) { + opt->rep.max_total_wal_size = n; +} + +void rocksdb_options_set_target_file_size_base( + rocksdb_options_t* opt, uint64_t n) { + opt->rep.target_file_size_base = n; +} + +void rocksdb_options_set_target_file_size_multiplier( + rocksdb_options_t* opt, int n) { + opt->rep.target_file_size_multiplier = n; +} + +void rocksdb_options_set_max_bytes_for_level_base( + rocksdb_options_t* opt, uint64_t n) { + opt->rep.max_bytes_for_level_base = n; +} + +void rocksdb_options_set_level_compaction_dynamic_level_bytes( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.level_compaction_dynamic_level_bytes = v; +} + +void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt, + double n) { + opt->rep.max_bytes_for_level_multiplier = n; +} + +void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt, + uint64_t n) { + opt->rep.max_compaction_bytes = n; +} + +void rocksdb_options_set_max_bytes_for_level_multiplier_additional( + rocksdb_options_t* opt, int* level_values, size_t num_levels) { + opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels); + for (size_t i = 0; i < num_levels; ++i) { + opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i]; + } +} + +void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { + opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +} + +void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.skip_stats_update_on_db_open = val; +} + +void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt, unsigned char val) { + opt->rep.skip_checking_sst_file_sizes_on_db_open = val; +} + +void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { + opt->rep.num_levels = n; +} + +void rocksdb_options_set_level0_file_num_compaction_trigger( + rocksdb_options_t* opt, int n) { + opt->rep.level0_file_num_compaction_trigger = n; +} + +void rocksdb_options_set_level0_slowdown_writes_trigger( + rocksdb_options_t* opt, int n) { + opt->rep.level0_slowdown_writes_trigger = n; +} +
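+// Usage sketch (illustrative): a minimal options setup built from the setters
+// above; the numbers are arbitrary examples, not tuning advice.
+//
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_create_if_missing(opts, 1);
+//   rocksdb_options_set_write_buffer_size(opts, 64 << 20);   /* 64 MiB */
+//   rocksdb_options_set_max_open_files(opts, 1000);
+//   rocksdb_options_set_target_file_size_base(opts, 64 << 20);
+//   /* ... open the DB with opts, then rocksdb_options_destroy(opts); */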
+void rocksdb_options_set_level0_stop_writes_trigger( + rocksdb_options_t* opt, int n) { + opt->rep.level0_stop_writes_trigger = n; +} + +void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* /*opt*/, + int /*n*/) {} + +void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) { + opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode); +} + +void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) { + opt->rep.compression = static_cast<CompressionType>(t); +} + +void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, + int* level_values, + size_t num_levels) { + opt->rep.compression_per_level.resize(num_levels); + for (size_t i = 0; i < num_levels; ++i) { + opt->rep.compression_per_level[i] = + static_cast<CompressionType>(level_values[i]); + } +} + +void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt, + int w_bits, int level, + int strategy, + int max_dict_bytes, + bool enabled) { + opt->rep.bottommost_compression_opts.window_bits = w_bits; + opt->rep.bottommost_compression_opts.level = level; + opt->rep.bottommost_compression_opts.strategy = strategy; + opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + +void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, + int level, int strategy, + int max_dict_bytes) { + opt->rep.compression_opts.window_bits = w_bits; + opt->rep.compression_opts.level = level; + opt->rep.compression_opts.strategy = strategy; + opt->rep.compression_opts.max_dict_bytes = max_dict_bytes; +} + +void rocksdb_options_set_prefix_extractor( + rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { + opt->rep.prefix_extractor.reset(prefix_extractor); +} + +void rocksdb_options_set_use_fsync( + rocksdb_options_t* opt, int use_fsync) { + opt->rep.use_fsync = use_fsync; +} + +void rocksdb_options_set_db_log_dir( + rocksdb_options_t* opt, const char* db_log_dir) { + opt->rep.db_log_dir = db_log_dir; +} + +void rocksdb_options_set_wal_dir( + rocksdb_options_t* opt, const char* v) { + opt->rep.wal_dir = v; +} + +void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) { + opt->rep.WAL_ttl_seconds = ttl; +} + +void rocksdb_options_set_WAL_size_limit_MB( + rocksdb_options_t* opt, uint64_t limit) { + opt->rep.WAL_size_limit_MB = limit; +} + +void rocksdb_options_set_manifest_preallocation_size( + rocksdb_options_t* opt, size_t v) { + opt->rep.manifest_preallocation_size = v; +} + +// noop +void rocksdb_options_set_purge_redundant_kvs_while_flush( + rocksdb_options_t* /*opt*/, unsigned char /*v*/) {} + +void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.use_direct_reads = v; +} + +void rocksdb_options_set_use_direct_io_for_flush_and_compaction( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.use_direct_io_for_flush_and_compaction = v; +} + +void rocksdb_options_set_allow_mmap_reads( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_mmap_reads = v; +} + +void rocksdb_options_set_allow_mmap_writes( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_mmap_writes = v; +} + +void rocksdb_options_set_is_fd_close_on_exec( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.is_fd_close_on_exec = v; +} + +void rocksdb_options_set_skip_log_error_on_recovery( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.skip_log_error_on_recovery = v; +} +
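+// Usage sketch (illustrative): leave the upper levels uncompressed and
+// compress the rest, using the per-level setter above with the
+// rocksdb_*_compression constants declared in c.h.
+//
+//   int levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+//                   rocksdb_snappy_compression, rocksdb_snappy_compression};
+//   rocksdb_options_set_compression_per_level(opts, levels, 4);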
+void rocksdb_options_set_stats_dump_period_sec( + rocksdb_options_t* opt, unsigned int v) { + opt->rep.stats_dump_period_sec = v; +} + +void rocksdb_options_set_advise_random_on_open( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.advise_random_on_open = v; +} + +void rocksdb_options_set_access_hint_on_compaction_start( + rocksdb_options_t* opt, int v) { + switch(v) { + case 0: + opt->rep.access_hint_on_compaction_start = + ROCKSDB_NAMESPACE::Options::NONE; + break; + case 1: + opt->rep.access_hint_on_compaction_start = + ROCKSDB_NAMESPACE::Options::NORMAL; + break; + case 2: + opt->rep.access_hint_on_compaction_start = + ROCKSDB_NAMESPACE::Options::SEQUENTIAL; + break; + case 3: + opt->rep.access_hint_on_compaction_start = + ROCKSDB_NAMESPACE::Options::WILLNEED; + break; + } +} + +void rocksdb_options_set_use_adaptive_mutex( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.use_adaptive_mutex = v; +} + +void rocksdb_options_set_wal_bytes_per_sync( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.wal_bytes_per_sync = v; +} + +void rocksdb_options_set_bytes_per_sync( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.bytes_per_sync = v; +} + +void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt, + uint64_t v) { + opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v); +} + +void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.allow_concurrent_memtable_write = v; +} + +void rocksdb_options_set_enable_write_thread_adaptive_yield( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.enable_write_thread_adaptive_yield = v; +} + +void rocksdb_options_set_max_sequential_skip_in_iterations( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.max_sequential_skip_in_iterations = v; +} + +void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) { + opt->rep.max_write_buffer_number = n; +} + +void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) { + opt->rep.min_write_buffer_number_to_merge = n; +} + +void rocksdb_options_set_max_write_buffer_number_to_maintain( + rocksdb_options_t* opt, int n) { + opt->rep.max_write_buffer_number_to_maintain = n; +} + +void rocksdb_options_set_max_write_buffer_size_to_maintain( + rocksdb_options_t* opt, int64_t n) { + opt->rep.max_write_buffer_size_to_maintain = n; +} + +void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.enable_pipelined_write = v; +} + +void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.unordered_write = v; +} + +void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, + uint32_t n) { + opt->rep.max_subcompactions = n; +} + +void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) { + opt->rep.max_background_jobs = n; +} + +void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { + opt->rep.max_background_compactions = n; +} + +void rocksdb_options_set_base_background_compactions(rocksdb_options_t* opt, + int n) { + opt->rep.base_background_compactions = n; +} + +void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) { + opt->rep.max_background_flushes = n; +} + +void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) { + opt->rep.max_log_file_size = v; +} + +void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) { + opt->rep.log_file_time_to_roll = v; +} + +void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) { + 
opt->rep.keep_log_file_num = v; +} + +void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt, + size_t v) { + opt->rep.recycle_log_file_num = v; +} + +void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) { + opt->rep.soft_rate_limit = v; +} + +void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) { + opt->rep.hard_rate_limit = v; +} + +void rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { + opt->rep.soft_pending_compaction_bytes_limit = v; +} + +void rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { + opt->rep.hard_pending_compaction_bytes_limit = v; +} + +void rocksdb_options_set_rate_limit_delay_max_milliseconds( + rocksdb_options_t* opt, unsigned int v) { + opt->rep.rate_limit_delay_max_milliseconds = v; +} + +void rocksdb_options_set_max_manifest_file_size( + rocksdb_options_t* opt, size_t v) { + opt->rep.max_manifest_file_size = v; +} + +void rocksdb_options_set_table_cache_numshardbits( + rocksdb_options_t* opt, int v) { + opt->rep.table_cache_numshardbits = v; +} + +void rocksdb_options_set_table_cache_remove_scan_count_limit( + rocksdb_options_t* /*opt*/, int /*v*/) { + // this option is deprecated +} + +void rocksdb_options_set_arena_block_size( + rocksdb_options_t* opt, size_t v) { + opt->rep.arena_block_size = v; +} + +void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) { + opt->rep.disable_auto_compactions = disable; +} + +void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, int v) { + opt->rep.optimize_filters_for_hits = v; +} + +void rocksdb_options_set_delete_obsolete_files_period_micros( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.delete_obsolete_files_period_micros = v; +} + +void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { + opt->rep.PrepareForBulkLoad(); +} + +void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) { + opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory); +} + +void rocksdb_options_set_memtable_prefix_bloom_size_ratio( + rocksdb_options_t* opt, double v) { + opt->rep.memtable_prefix_bloom_size_ratio = v; +} + +void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt, + size_t v) { + opt->rep.memtable_huge_page_size = v; +} + +void rocksdb_options_set_hash_skip_list_rep( + rocksdb_options_t *opt, size_t bucket_count, + int32_t skiplist_height, int32_t skiplist_branching_factor) { + ROCKSDB_NAMESPACE::MemTableRepFactory* factory = + ROCKSDB_NAMESPACE::NewHashSkipListRepFactory( + bucket_count, skiplist_height, skiplist_branching_factor); + opt->rep.memtable_factory.reset(factory); +} + +void rocksdb_options_set_hash_link_list_rep( + rocksdb_options_t *opt, size_t bucket_count) { + opt->rep.memtable_factory.reset( + ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count)); +} + +void rocksdb_options_set_plain_table_factory( + rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness) { + ROCKSDB_NAMESPACE::PlainTableOptions options; + options.user_key_len = user_key_len; + options.bloom_bits_per_key = bloom_bits_per_key; + options.hash_table_ratio = hash_table_ratio; + options.index_sparseness = index_sparseness; + + ROCKSDB_NAMESPACE::TableFactory* factory = + ROCKSDB_NAMESPACE::NewPlainTableFactory(options); + opt->rep.table_factory.reset(factory); +} + +void rocksdb_options_set_max_successive_merges( + 
rocksdb_options_t* opt, size_t v) { + opt->rep.max_successive_merges = v; +} + +void rocksdb_options_set_bloom_locality( + rocksdb_options_t* opt, uint32_t v) { + opt->rep.bloom_locality = v; +} + +void rocksdb_options_set_inplace_update_support( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.inplace_update_support = v; +} + +void rocksdb_options_set_inplace_update_num_locks( + rocksdb_options_t* opt, size_t v) { + opt->rep.inplace_update_num_locks = v; +} + +void rocksdb_options_set_report_bg_io_stats( + rocksdb_options_t* opt, int v) { + opt->rep.report_bg_io_stats = v; +} + +void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { + opt->rep.compaction_style = + static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style); +} + +void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) { + opt->rep.compaction_options_universal = *(uco->rep); +} + +void rocksdb_options_set_fifo_compaction_options( + rocksdb_options_t* opt, + rocksdb_fifo_compaction_options_t* fifo) { + opt->rep.compaction_options_fifo = fifo->rep; +} + +char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) { + ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get(); + if (statistics) { + return strdup(statistics->ToString().c_str()); + } + return nullptr; +} + +void rocksdb_options_set_ratelimiter(rocksdb_options_t *opt, rocksdb_ratelimiter_t *limiter) { + if (limiter) { + opt->rep.rate_limiter = limiter->rep; + } +} + +void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt, + unsigned char atomic_flush) { + opt->rep.atomic_flush = atomic_flush; +} + +rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( + int64_t rate_bytes_per_sec, + int64_t refill_period_us, + int32_t fairness) { + rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t; + rate_limiter->rep.reset( + NewGenericRateLimiter(rate_bytes_per_sec, + refill_period_us, fairness)); + return rate_limiter; +} + +void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) { + delete limiter; +} + +void rocksdb_options_set_row_cache(rocksdb_options_t* opt, rocksdb_cache_t* cache) { + if(cache) { + opt->rep.row_cache = cache->rep; + } +} + +void rocksdb_set_perf_level(int v) { + PerfLevel level = static_cast<PerfLevel>(v); + SetPerfLevel(level); +} + +rocksdb_perfcontext_t* rocksdb_perfcontext_create() { + rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t; + context->rep = ROCKSDB_NAMESPACE::get_perf_context(); + return context; +} + +void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) { + context->rep->Reset(); +} + +char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context, + unsigned char exclude_zero_counters) { + return strdup(context->rep->ToString(exclude_zero_counters).c_str()); +} + +uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, + int metric) { + PerfContext* rep = context->rep; + switch (metric) { + case rocksdb_user_key_comparison_count: + return rep->user_key_comparison_count; + case rocksdb_block_cache_hit_count: + return rep->block_cache_hit_count; + case rocksdb_block_read_count: + return rep->block_read_count; + case rocksdb_block_read_byte: + return rep->block_read_byte; + case rocksdb_block_read_time: + return rep->block_read_time; + case rocksdb_block_checksum_time: + return rep->block_checksum_time; + case rocksdb_block_decompress_time: + return rep->block_decompress_time; + case rocksdb_get_read_bytes: + return rep->get_read_bytes; + case rocksdb_multiget_read_bytes: + return rep->multiget_read_bytes; + case 
rocksdb_iter_read_bytes: + return rep->iter_read_bytes; + case rocksdb_internal_key_skipped_count: + return rep->internal_key_skipped_count; + case rocksdb_internal_delete_skipped_count: + return rep->internal_delete_skipped_count; + case rocksdb_internal_recent_skipped_count: + return rep->internal_recent_skipped_count; + case rocksdb_internal_merge_count: + return rep->internal_merge_count; + case rocksdb_get_snapshot_time: + return rep->get_snapshot_time; + case rocksdb_get_from_memtable_time: + return rep->get_from_memtable_time; + case rocksdb_get_from_memtable_count: + return rep->get_from_memtable_count; + case rocksdb_get_post_process_time: + return rep->get_post_process_time; + case rocksdb_get_from_output_files_time: + return rep->get_from_output_files_time; + case rocksdb_seek_on_memtable_time: + return rep->seek_on_memtable_time; + case rocksdb_seek_on_memtable_count: + return rep->seek_on_memtable_count; + case rocksdb_next_on_memtable_count: + return rep->next_on_memtable_count; + case rocksdb_prev_on_memtable_count: + return rep->prev_on_memtable_count; + case rocksdb_seek_child_seek_time: + return rep->seek_child_seek_time; + case rocksdb_seek_child_seek_count: + return rep->seek_child_seek_count; + case rocksdb_seek_min_heap_time: + return rep->seek_min_heap_time; + case rocksdb_seek_max_heap_time: + return rep->seek_max_heap_time; + case rocksdb_seek_internal_seek_time: + return rep->seek_internal_seek_time; + case rocksdb_find_next_user_entry_time: + return rep->find_next_user_entry_time; + case rocksdb_write_wal_time: + return rep->write_wal_time; + case rocksdb_write_memtable_time: + return rep->write_memtable_time; + case rocksdb_write_delay_time: + return rep->write_delay_time; + case rocksdb_write_pre_and_post_process_time: + return rep->write_pre_and_post_process_time; + case rocksdb_db_mutex_lock_nanos: + return rep->db_mutex_lock_nanos; + case rocksdb_db_condition_wait_nanos: + return rep->db_condition_wait_nanos; + case rocksdb_merge_operator_time_nanos: + return rep->merge_operator_time_nanos; + case rocksdb_read_index_block_nanos: + return rep->read_index_block_nanos; + case rocksdb_read_filter_block_nanos: + return rep->read_filter_block_nanos; + case rocksdb_new_table_block_iter_nanos: + return rep->new_table_block_iter_nanos; + case rocksdb_new_table_iterator_nanos: + return rep->new_table_iterator_nanos; + case rocksdb_block_seek_nanos: + return rep->block_seek_nanos; + case rocksdb_find_table_nanos: + return rep->find_table_nanos; + case rocksdb_bloom_memtable_hit_count: + return rep->bloom_memtable_hit_count; + case rocksdb_bloom_memtable_miss_count: + return rep->bloom_memtable_miss_count; + case rocksdb_bloom_sst_hit_count: + return rep->bloom_sst_hit_count; + case rocksdb_bloom_sst_miss_count: + return rep->bloom_sst_miss_count; + case rocksdb_key_lock_wait_time: + return rep->key_lock_wait_time; + case rocksdb_key_lock_wait_count: + return rep->key_lock_wait_count; + case rocksdb_env_new_sequential_file_nanos: + return rep->env_new_sequential_file_nanos; + case rocksdb_env_new_random_access_file_nanos: + return rep->env_new_random_access_file_nanos; + case rocksdb_env_new_writable_file_nanos: + return rep->env_new_writable_file_nanos; + case rocksdb_env_reuse_writable_file_nanos: + return rep->env_reuse_writable_file_nanos; + case rocksdb_env_new_random_rw_file_nanos: + return rep->env_new_random_rw_file_nanos; + case rocksdb_env_new_directory_nanos: + return rep->env_new_directory_nanos; + case rocksdb_env_file_exists_nanos: + return 
rep->env_file_exists_nanos; + case rocksdb_env_get_children_nanos: + return rep->env_get_children_nanos; + case rocksdb_env_get_children_file_attributes_nanos: + return rep->env_get_children_file_attributes_nanos; + case rocksdb_env_delete_file_nanos: + return rep->env_delete_file_nanos; + case rocksdb_env_create_dir_nanos: + return rep->env_create_dir_nanos; + case rocksdb_env_create_dir_if_missing_nanos: + return rep->env_create_dir_if_missing_nanos; + case rocksdb_env_delete_dir_nanos: + return rep->env_delete_dir_nanos; + case rocksdb_env_get_file_size_nanos: + return rep->env_get_file_size_nanos; + case rocksdb_env_get_file_modification_time_nanos: + return rep->env_get_file_modification_time_nanos; + case rocksdb_env_rename_file_nanos: + return rep->env_rename_file_nanos; + case rocksdb_env_link_file_nanos: + return rep->env_link_file_nanos; + case rocksdb_env_lock_file_nanos: + return rep->env_lock_file_nanos; + case rocksdb_env_unlock_file_nanos: + return rep->env_unlock_file_nanos; + case rocksdb_env_new_logger_nanos: + return rep->env_new_logger_nanos; + default: + break; + } + return 0; +} + +void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) { + delete context; +} + +/* +TODO: +DB::OpenForReadOnly +DB::KeyMayExist +DB::GetOptions +DB::GetSortedWalFiles +DB::GetLatestSequenceNumber +DB::GetUpdatesSince +DB::GetDbIdentity +DB::RunManualCompaction +custom cache +table_properties_collectors +*/ + +rocksdb_compactionfilter_t* rocksdb_compactionfilter_create( + void* state, + void (*destructor)(void*), + unsigned char (*filter)( + void*, + int level, + const char* key, size_t key_length, + const char* existing_value, size_t value_length, + char** new_value, size_t *new_value_length, + unsigned char* value_changed), + const char* (*name)(void*)) { + rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t; + result->state_ = state; + result->destructor_ = destructor; + result->filter_ = filter; + result->ignore_snapshots_ = true; + result->name_ = name; + return result; +} + +void rocksdb_compactionfilter_set_ignore_snapshots( + rocksdb_compactionfilter_t* filter, + unsigned char whether_ignore) { + filter->ignore_snapshots_ = whether_ignore; +} + +void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) { + delete filter; +} + +unsigned char rocksdb_compactionfiltercontext_is_full_compaction( + rocksdb_compactionfiltercontext_t* context) { + return context->rep.is_full_compaction; +} + +unsigned char rocksdb_compactionfiltercontext_is_manual_compaction( + rocksdb_compactionfiltercontext_t* context) { + return context->rep.is_manual_compaction; +} + +rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create( + void* state, void (*destructor)(void*), + rocksdb_compactionfilter_t* (*create_compaction_filter)( + void*, rocksdb_compactionfiltercontext_t* context), + const char* (*name)(void*)) { + rocksdb_compactionfilterfactory_t* result = + new rocksdb_compactionfilterfactory_t; + result->state_ = state; + result->destructor_ = destructor; + result->create_compaction_filter_ = create_compaction_filter; + result->name_ = name; + return result; +} + +void rocksdb_compactionfilterfactory_destroy( + rocksdb_compactionfilterfactory_t* factory) { + delete factory; +} + +rocksdb_comparator_t* rocksdb_comparator_create( + void* state, + void (*destructor)(void*), + int (*compare)( + void*, + const char* a, size_t alen, + const char* b, size_t blen), + const char* (*name)(void*)) { + rocksdb_comparator_t* result = new 
rocksdb_comparator_t; + result->state_ = state; + result->destructor_ = destructor; + result->compare_ = compare; + result->name_ = name; + return result; +} + +void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { + delete cmp; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create( + void* state, + void (*destructor)(void*), + char* (*create_filter)( + void*, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length), + unsigned char (*key_may_match)( + void*, + const char* key, size_t length, + const char* filter, size_t filter_length), + void (*delete_filter)( + void*, + const char* filter, size_t filter_length), + const char* (*name)(void*)) { + rocksdb_filterpolicy_t* result = new rocksdb_filterpolicy_t; + result->state_ = state; + result->destructor_ = destructor; + result->create_ = create_filter; + result->key_match_ = key_may_match; + result->delete_filter_ = delete_filter; + result->name_ = name; + return result; +} + +void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) { + delete filter; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(int bits_per_key, bool original_format) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewBloomFilterPolicy() instead of user + // supplied C functions. + struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + void CreateFilter(const Slice* keys, int n, + std::string* dst) const override { + return rep_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + return rep_->KeyMayMatch(key, filter); + } + // No need to override GetFilterBitsBuilder if this one is overridden + ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( + const ROCKSDB_NAMESPACE::FilterBuildingContext& context) + const override { + return rep_->GetBuilderWithContext(context); + } + ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader( + const Slice& contents) const override { + return rep_->GetFilterBitsReader(contents); + } + static void DoNothing(void*) {} + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format); + wrapper->state_ = nullptr; + wrapper->delete_filter_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(int bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { + return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true); +} + +rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( + void* state, void (*destructor)(void*), + char* (*full_merge)(void*, const char* key, size_t key_length, + const char* existing_value, + size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + char* (*partial_merge)(void*, const char* key, size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length), + void (*delete_value)(void*, const char* value, size_t value_length), + const char* (*name)(void*)) { + rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t; + 
result->state_ = state; + result->destructor_ = destructor; + result->full_merge_ = full_merge; + result->partial_merge_ = partial_merge; + result->delete_value_ = delete_value; + result->name_ = name; + return result; +} + +void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) { + delete merge_operator; +} + +rocksdb_readoptions_t* rocksdb_readoptions_create() { + return new rocksdb_readoptions_t; +} + +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { + delete opt; +} + +void rocksdb_readoptions_set_verify_checksums( + rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.verify_checksums = v; +} + +void rocksdb_readoptions_set_fill_cache( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.fill_cache = v; +} + +void rocksdb_readoptions_set_snapshot( + rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { + opt->rep.snapshot = (snap ? snap->rep : nullptr); +} + +void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t* opt, + const char* key, size_t keylen) { + if (key == nullptr) { + opt->upper_bound = Slice(); + opt->rep.iterate_upper_bound = nullptr; + + } else { + opt->upper_bound = Slice(key, keylen); + opt->rep.iterate_upper_bound = &opt->upper_bound; + } +} + +void rocksdb_readoptions_set_iterate_lower_bound( + rocksdb_readoptions_t *opt, + const char* key, size_t keylen) { + if (key == nullptr) { + opt->lower_bound = Slice(); + opt->rep.iterate_lower_bound = nullptr; + } else { + opt->lower_bound = Slice(key, keylen); + opt->rep.iterate_lower_bound = &opt->lower_bound; + } +} + +void rocksdb_readoptions_set_read_tier( + rocksdb_readoptions_t* opt, int v) { + opt->rep.read_tier = static_cast<ReadTier>(v); +} + +void rocksdb_readoptions_set_tailing( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.tailing = v; +} + +void rocksdb_readoptions_set_managed( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.managed = v; +} + +void rocksdb_readoptions_set_readahead_size( + rocksdb_readoptions_t* opt, size_t v) { + opt->rep.readahead_size = v; +} + +void rocksdb_readoptions_set_prefix_same_as_start( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.prefix_same_as_start = v; +} + +void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.pin_data = v; +} + +void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.total_order_seek = v; +} + +void rocksdb_readoptions_set_max_skippable_internal_keys( + rocksdb_readoptions_t* opt, + uint64_t v) { + opt->rep.max_skippable_internal_keys = v; +} + +void rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.background_purge_on_iterator_cleanup = v; +} + +void rocksdb_readoptions_set_ignore_range_deletions( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.ignore_range_deletions = v; +} + +rocksdb_writeoptions_t* rocksdb_writeoptions_create() { + return new rocksdb_writeoptions_t; +} + +void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { + delete opt; +} + +void rocksdb_writeoptions_set_sync( + rocksdb_writeoptions_t* opt, unsigned char v) { + opt->rep.sync = v; +} + +void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) { + opt->rep.disableWAL = disable; +} + +void rocksdb_writeoptions_set_ignore_missing_column_families( + rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.ignore_missing_column_families = v; +} +
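+// Usage sketch (illustrative): bounded iteration with the setters above.
+// Note that the options struct stores only a Slice referencing the caller's
+// buffer, so the bound's memory must stay valid while the iterator is used.
+//
+//   rocksdb_readoptions_t* ro = rocksdb_readoptions_create();
+//   rocksdb_readoptions_set_iterate_upper_bound(ro, "bar", 3);
+//   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ro);
+//   for (rocksdb_iter_seek(it, "a", 1); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) { /* sees only keys below "bar" */ }
+//   rocksdb_iter_destroy(it);
+//   rocksdb_readoptions_destroy(ro);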
+void rocksdb_writeoptions_set_no_slowdown( + rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.no_slowdown = v; +} + +void rocksdb_writeoptions_set_low_pri( + rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.low_pri = v; +} + +void rocksdb_writeoptions_set_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t* opt, unsigned char v) { + opt->rep.memtable_insert_hint_per_batch = v; +} + +rocksdb_compactoptions_t* rocksdb_compactoptions_create() { + return new rocksdb_compactoptions_t; +} + +void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) { + delete opt; +} + +void rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t* opt, unsigned char v) { + opt->rep.bottommost_level_compaction = static_cast<BottommostLevelCompaction>(v); +} + +void rocksdb_compactoptions_set_exclusive_manual_compaction( + rocksdb_compactoptions_t* opt, unsigned char v) { + opt->rep.exclusive_manual_compaction = v; +} + +void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt, + unsigned char v) { + opt->rep.change_level = v; +} + +void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt, + int n) { + opt->rep.target_level = n; +} + +rocksdb_flushoptions_t* rocksdb_flushoptions_create() { + return new rocksdb_flushoptions_t; +} + +void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { + delete opt; +} + +void rocksdb_flushoptions_set_wait( + rocksdb_flushoptions_t* opt, unsigned char v) { + opt->rep.wait = v; +} + +rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(capacity); + return c; +} + +void rocksdb_cache_destroy(rocksdb_cache_t* cache) { + delete cache; +} + +void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { + cache->rep->SetCapacity(capacity); +} + +size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) { + return cache->rep->GetUsage(); +} + +size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) { + return cache->rep->GetPinnedUsage(); +} + +rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) { + rocksdb_dbpath_t* result = new rocksdb_dbpath_t; + result->rep.path = std::string(path); + result->rep.target_size = target_size; + return result; +} + +void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { + delete dbpath; +} + +rocksdb_env_t* rocksdb_create_default_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = Env::Default(); + result->is_default = true; + return result; +} + +rocksdb_env_t* rocksdb_create_mem_env() { + rocksdb_env_t* result = new rocksdb_env_t; + result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default()); + result->is_default = false; + return result; +} + +void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n); +} + +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { + env->rep->SetBackgroundThreads(n, Env::HIGH); +} + +void rocksdb_env_join_all_threads(rocksdb_env_t* env) { + env->rep->WaitForJoin(); +} + +void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(Env::HIGH); +} + +void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) { + 
env->rep->LowerThreadPoolCPUPriority(Env::HIGH); +} + +void rocksdb_env_destroy(rocksdb_env_t* env) { + if (!env->is_default) delete env->rep; + delete env; +} + +rocksdb_envoptions_t* rocksdb_envoptions_create() { + rocksdb_envoptions_t* opt = new rocksdb_envoptions_t; + return opt; +} + +void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; } + +rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) { + rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; + writer->rep = new SstFileWriter(env->rep, io_options->rep); + return writer; +} + +rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator( + const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, + const rocksdb_comparator_t* /*comparator*/) { + rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; + writer->rep = new SstFileWriter(env->rep, io_options->rep); + return writer; +} + +void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer, + const char* name, char** errptr) { + SaveError(errptr, writer->rep->Open(std::string(name))); +} + +void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key, + size_t keylen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key, + size_t keylen, const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen))); +} + +void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer, + const char* key, size_t keylen, + char** errptr) { + SaveError(errptr, writer->rep->Delete(Slice(key, keylen))); +} + +void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer, + char** errptr) { + SaveError(errptr, writer->rep->Finish(nullptr)); +} + +void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer, + uint64_t* file_size) { + *file_size = writer->rep->FileSize(); +} + +void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) { + delete writer->rep; + delete writer; +} + +rocksdb_ingestexternalfileoptions_t* +rocksdb_ingestexternalfileoptions_create() { + rocksdb_ingestexternalfileoptions_t* opt = + new rocksdb_ingestexternalfileoptions_t; + return opt; +} + +void rocksdb_ingestexternalfileoptions_set_move_files( + rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) { + opt->rep.move_files = move_files; +} + +void rocksdb_ingestexternalfileoptions_set_snapshot_consistency( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char snapshot_consistency) { + opt->rep.snapshot_consistency = snapshot_consistency; +} + +void rocksdb_ingestexternalfileoptions_set_allow_global_seqno( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_global_seqno) { + opt->rep.allow_global_seqno = allow_global_seqno; +} + +void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char allow_blocking_flush) { + opt->rep.allow_blocking_flush = allow_blocking_flush; +} + +void rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char 
ingest_behind) { + opt->rep.ingest_behind = ingest_behind; +} + +void rocksdb_ingestexternalfileoptions_destroy( + rocksdb_ingestexternalfileoptions_t* opt) { + delete opt; +} + +void rocksdb_ingest_external_file( + rocksdb_t* db, const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) { + std::vector<std::string> files(list_len); + for (size_t i = 0; i < list_len; ++i) { + files[i] = std::string(file_list[i]); + } + SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep)); +} + +void rocksdb_ingest_external_file_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, + const char* const* file_list, const size_t list_len, + const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) { + std::vector<std::string> files(list_len); + for (size_t i = 0; i < list_len; ++i) { + files[i] = std::string(file_list[i]); + } + SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep)); +} + +void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->TryCatchUpWithPrimary()); +} + +rocksdb_slicetransform_t* rocksdb_slicetransform_create( + void* state, + void (*destructor)(void*), + char* (*transform)( + void*, + const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)( + void*, + const char* key, size_t length), + unsigned char (*in_range)( + void*, + const char* key, size_t length), + const char* (*name)(void*)) { + rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t; + result->state_ = state; + result->destructor_ = destructor; + result->transform_ = transform; + result->in_domain_ = in_domain; + result->in_range_ = in_range; + result->name_ = name; + return result; +} + +void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { + delete st; +} + +struct Wrapper : public rocksdb_slicetransform_t { + const SliceTransform* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + Slice Transform(const Slice& src) const override { + return rep_->Transform(src); + } + bool InDomain(const Slice& src) const override { + return rep_->InDomain(src); + } + bool InRange(const Slice& src) const override { return rep_->InRange(src); } + static void DoNothing(void*) { } +}; + +rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) { + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen); + wrapper->state_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() { + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform(); + wrapper->state_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() { + rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t; + result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal; + return result; +} + +void rocksdb_universal_compaction_options_set_size_ratio( + rocksdb_universal_compaction_options_t* uco, int ratio) { + uco->rep->size_ratio = ratio; +} + +void rocksdb_universal_compaction_options_set_min_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->min_merge_width = w; +} +
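+// Usage sketch (illustrative, path is a placeholder): ingesting an SST file
+// produced by the rocksdb_sstfilewriter_* API into a live DB.
+//
+//   const char* files[] = {"/tmp/bulk.sst"};
+//   rocksdb_ingestexternalfileoptions_t* ifo =
+//       rocksdb_ingestexternalfileoptions_create();
+//   rocksdb_ingest_external_file(db, files, 1, ifo, &err);
+//   rocksdb_ingestexternalfileoptions_destroy(ifo);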
+void rocksdb_universal_compaction_options_set_max_merge_width( + rocksdb_universal_compaction_options_t* uco, int w) { + uco->rep->max_merge_width = w; +} + +void rocksdb_universal_compaction_options_set_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->max_size_amplification_percent = p; +} + +void rocksdb_universal_compaction_options_set_compression_size_percent( + rocksdb_universal_compaction_options_t* uco, int p) { + uco->rep->compression_size_percent = p; +} + +void rocksdb_universal_compaction_options_set_stop_style( + rocksdb_universal_compaction_options_t* uco, int style) { + uco->rep->stop_style = + static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style); +} + +void rocksdb_universal_compaction_options_destroy( + rocksdb_universal_compaction_options_t* uco) { + delete uco->rep; + delete uco; +} + +rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() { + rocksdb_fifo_compaction_options_t* result = new rocksdb_fifo_compaction_options_t; + result->rep = CompactionOptionsFIFO(); + return result; +} + +void rocksdb_fifo_compaction_options_set_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) { + fifo_opts->rep.max_table_files_size = size; +} + +void rocksdb_fifo_compaction_options_destroy( + rocksdb_fifo_compaction_options_t* fifo_opts) { + delete fifo_opts; +} + +void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) { + if (level >= 0) { + assert(level <= opt->rep.num_levels); + opt->rep.compression_per_level.resize(opt->rep.num_levels); + for (int i = 0; i < level; i++) { + opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression; + } + for (int i = level; i < opt->rep.num_levels; i++) { + opt->rep.compression_per_level[i] = opt->rep.compression; + } + } +} + +int rocksdb_livefiles_count( + const rocksdb_livefiles_t* lf) { + return static_cast<int>(lf->rep.size()); +} + +const char* rocksdb_livefiles_name( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].name.c_str(); +} + +int rocksdb_livefiles_level( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].level; +} + +size_t rocksdb_livefiles_size( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].size; +} + +const char* rocksdb_livefiles_smallestkey( + const rocksdb_livefiles_t* lf, + int index, + size_t* size) { + *size = lf->rep[index].smallestkey.size(); + return lf->rep[index].smallestkey.data(); +} + +const char* rocksdb_livefiles_largestkey( + const rocksdb_livefiles_t* lf, + int index, + size_t* size) { + *size = lf->rep[index].largestkey.size(); + return lf->rep[index].largestkey.data(); +} + +uint64_t rocksdb_livefiles_entries( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].num_entries; +} + +uint64_t rocksdb_livefiles_deletions( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].num_deletions; +} + +extern void rocksdb_livefiles_destroy( + const rocksdb_livefiles_t* lf) { + delete lf; +} + +void rocksdb_get_options_from_string(const rocksdb_options_t* base_options, + const char* opts_str, + rocksdb_options_t* new_options, + char** errptr) { + SaveError(errptr, + GetOptionsFromString(base_options->rep, std::string(opts_str), + &new_options->rep)); +} + +void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key, + size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, db->rep->DefaultColumnFamily(), + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? 
(b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + +void rocksdb_delete_file_in_range_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, const char* limit_key, + size_t limit_key_len, char** errptr) { + Slice a, b; + SaveError( + errptr, + DeleteFilesInRange( + db->rep, column_family->rep, + (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr), + (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr))); +} + +rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() { + return new rocksdb_transactiondb_options_t; +} + +void rocksdb_transactiondb_options_destroy(rocksdb_transactiondb_options_t* opt){ + delete opt; +} + +void rocksdb_transactiondb_options_set_max_num_locks( + rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) { + opt->rep.max_num_locks = max_num_locks; +} + +void rocksdb_transactiondb_options_set_num_stripes( + rocksdb_transactiondb_options_t* opt, size_t num_stripes) { + opt->rep.num_stripes = num_stripes; +} + +void rocksdb_transactiondb_options_set_transaction_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) { + opt->rep.transaction_lock_timeout = txn_lock_timeout; +} + +void rocksdb_transactiondb_options_set_default_lock_timeout( + rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) { + opt->rep.default_lock_timeout = default_lock_timeout; +} + +rocksdb_transaction_options_t* rocksdb_transaction_options_create() { + return new rocksdb_transaction_options_t; +} + +void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) { + delete opt; +} + +void rocksdb_transaction_options_set_set_snapshot( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.set_snapshot = v; +} + +void rocksdb_transaction_options_set_deadlock_detect( + rocksdb_transaction_options_t* opt, unsigned char v) { + opt->rep.deadlock_detect = v; +} + +void rocksdb_transaction_options_set_lock_timeout( + rocksdb_transaction_options_t* opt, int64_t lock_timeout) { + opt->rep.lock_timeout = lock_timeout; +} + +void rocksdb_transaction_options_set_expiration( + rocksdb_transaction_options_t* opt, int64_t expiration) { + opt->rep.expiration = expiration; +} + +void rocksdb_transaction_options_set_deadlock_detect_depth( + rocksdb_transaction_options_t* opt, int64_t depth) { + opt->rep.deadlock_detect_depth = depth; +} + +void rocksdb_transaction_options_set_max_write_batch_size( + rocksdb_transaction_options_t* opt, size_t size) { + opt->rep.max_write_batch_size = size; +} + +rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create() { + return new rocksdb_optimistictransaction_options_t; +} + +void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt) { + delete opt; +} + +void rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v) { + opt->rep.set_snapshot = v; +} + +rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family( + rocksdb_transactiondb_t* txn_db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr) { + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + SaveError(errptr, txn_db->rep->CreateColumnFamily( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep))); + return handle; +} + +rocksdb_transactiondb_t* 
rocksdb_transactiondb_open( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + char** errptr) { + TransactionDB* txn_db; + if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep, + std::string(name), &txn_db))) { + return nullptr; + } + rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t; + result->rep = txn_db; + return result; +} + +rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector<ColumnFamilyDescriptor> column_families; + for (int i = 0; i < num_column_families; i++) { + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + TransactionDB* txn_db; + std::vector<ColumnFamilyHandle*> handles; + if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep, + std::string(name), column_families, + &handles, &txn_db))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t; + result->rep = txn_db; + return result; +} + +const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot( + rocksdb_transactiondb_t* txn_db) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + result->rep = txn_db->rep->GetSnapshot(); + return result; +} + +void rocksdb_transactiondb_release_snapshot( + rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) { + txn_db->rep->ReleaseSnapshot(snapshot->rep); + delete snapshot; +} + +rocksdb_transaction_t* rocksdb_transaction_begin( + rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_transaction_options_t* txn_options, + rocksdb_transaction_t* old_txn) { + if (old_txn == nullptr) { + rocksdb_transaction_t* result = new rocksdb_transaction_t; + result->rep = txn_db->rep->BeginTransaction(write_options->rep, + txn_options->rep, nullptr); + return result; + } + old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep, + txn_options->rep, old_txn->rep); + return old_txn; +} + +void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) { + SaveError(errptr, txn->rep->Commit()); +} + +void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) { + SaveError(errptr, txn->rep->Rollback()); +} + +void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) { + txn->rep->SetSavePoint(); +} + +void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, char** errptr) { + SaveError(errptr, txn->rep->RollbackToSavePoint()); +} + +void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) { + delete txn->rep; + delete txn; +} + +const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot( + rocksdb_transaction_t* txn) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + result->rep = txn->rep->GetSnapshot(); + return result; +} + +// Read a key inside a transaction +char* rocksdb_transaction_get(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, + char** 
errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +// Read a key inside a transaction +char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn->rep->GetForUpdate(options->rep, column_family->rep, + Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +// Read a key outside a transaction +char* rocksdb_transactiondb_get( + rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + size_t* vlen, + char** errptr){ + char* result = nullptr; + std::string tmp; + Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn_db->rep->Get(options->rep, column_family->rep, + Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +// Put a key inside a transaction +void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key, + size_t klen, const char* val, size_t vlen, + char** errptr) { + SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + +// Put a key outside a transaction +void 
rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, + txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, + const char* val, size_t vallen, + char** errptr) { + SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep, + Slice(key, keylen), Slice(val, vallen))); +} + +// Write batch into transaction db +void rocksdb_transactiondb_write( + rocksdb_transactiondb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); +} + +// Merge a key inside a transaction +void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key, + size_t klen, const char* val, size_t vlen, + char** errptr) { + SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen))); +} + +void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + +// Merge a key outside a transaction +void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen), + Slice(val, vlen))); +} + +void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep, + Slice(key, klen), Slice(val, vlen))); +} + +// Delete a key inside a transaction +void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key, + size_t klen, char** errptr) { + SaveError(errptr, txn->rep->Delete(Slice(key, klen))); +} + +void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr) { + SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen))); +} + +// Delete a key outside a transaction +void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, char** errptr) { + SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen))); +} + +void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr) { + SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep, + Slice(key, keylen))); +} + +// Create an iterator inside a transaction +rocksdb_iterator_t* rocksdb_transaction_create_iterator( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn->rep->GetIterator(options->rep); + return result; +} + +// Create an iterator inside a transaction with column family 
+rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = txn->rep->GetIterator(options->rep, column_family->rep);
+  return result;
+}
+
+// Create an iterator outside a transaction
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = txn_db->rep->NewIterator(options->rep);
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep);
+  return result;
+}
+
+void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) {
+  delete txn_db->rep;
+  delete txn_db;
+}
+
+rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
+    rocksdb_transactiondb_t* txn_db, char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(
+    const rocksdb_options_t* options, const char* name, char** errptr) {
+  OptimisticTransactionDB* otxn_db;
+  if (SaveError(errptr, OptimisticTransactionDB::Open(
+                            options->rep, std::string(name), &otxn_db))) {
+    return nullptr;
+  }
+  rocksdb_optimistictransactiondb_t* result =
+      new rocksdb_optimistictransactiondb_t;
+  result->rep = otxn_db;
+  return result;
+}
+
+rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  OptimisticTransactionDB* otxn_db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, OptimisticTransactionDB::Open(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &otxn_db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_optimistictransactiondb_t* result =
+      new rocksdb_optimistictransactiondb_t;
+  result->rep = otxn_db;
+  return result;
+}
+
+rocksdb_t* rocksdb_optimistictransactiondb_get_base_db(
+    rocksdb_optimistictransactiondb_t* otxn_db) {
+  DB* base_db = otxn_db->rep->GetBaseDB();
+
+  if (base_db != nullptr) {
+    rocksdb_t* result = new rocksdb_t;
+    result->rep = base_db;
+    return result;
+  }
+
+  return nullptr;
+}
+
+void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) {
+  delete base_db;
+}
+
+rocksdb_transaction_t* rocksdb_optimistictransaction_begin(
+    rocksdb_optimistictransactiondb_t* otxn_db,
+    const rocksdb_writeoptions_t* write_options,
+    const rocksdb_optimistictransaction_options_t* otxn_options,
+    rocksdb_transaction_t* old_txn) {
+  if (old_txn == nullptr) {
+    rocksdb_transaction_t* result = new rocksdb_transaction_t;
+    result->rep = otxn_db->rep->BeginTransaction(write_options->rep,
+                                                 otxn_options->rep, nullptr);
+    return result;
+  }
+  old_txn->rep = otxn_db->rep->BeginTransaction(
+      write_options->rep, otxn_options->rep, old_txn->rep);
+  return old_txn;
+}
+
+void rocksdb_optimistictransactiondb_close(
+    rocksdb_optimistictransactiondb_t* otxn_db) {
+  delete otxn_db->rep;
+  delete otxn_db;
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &v->rep);
+  if (!s.ok()) {
+    delete v;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
+
+const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
+                                        size_t* vlen) {
+  if (!v) {
+    *vlen = 0;
+    return nullptr;
+  }
+
+  *vlen = v->rep.size();
+  return v->rep.data();
+}
+
+// container to keep databases and caches in order to use
+// ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_consumers_t {
+  std::vector<rocksdb_t*> dbs;
+  std::unordered_set<rocksdb_cache_t*> caches;
+};
+
+// initializes new container of memory consumers
+rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
+  return new rocksdb_memory_consumers_t;
+}
+
+// adds database to the container of memory consumers
+void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
+                                     rocksdb_t* db) {
+  consumers->dbs.push_back(db);
+}
+
+// adds cache to the container of memory consumers
+void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
+                                        rocksdb_cache_t* cache) {
+  consumers->caches.insert(cache);
+}
+
+// deletes container with memory consumers
+void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
+  delete consumers;
+}
+
+// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_usage_t {
+  uint64_t mem_table_total;
+  uint64_t mem_table_unflushed;
+  uint64_t mem_table_readers_total;
+  uint64_t cache_total;
+};
+
+// estimates amount of memory occupied by consumers (dbs and caches)
+rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
+    rocksdb_memory_consumers_t* consumers, char** errptr) {
+
+  vector<DB*> dbs;
+  for (auto db : consumers->dbs) {
+    dbs.push_back(db->rep);
+  }
+
+  unordered_set<const Cache*> cache_set;
+  for (auto cache : consumers->caches) {
+    cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
+  }
+
+  std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
+
+  auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+                                                            &usage_by_type);
+  if (SaveError(errptr, status)) {
+    return nullptr;
+  }
+
+  auto result = new rocksdb_memory_usage_t;
+  result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
+  result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
+  result->mem_table_readers_total =
+      usage_by_type[MemoryUtil::kTableReadersTotal];
+  result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
+  return result;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_unflushed;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_readers_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_cache_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->cache_total;
+}
+
+// deletes container with memory usage estimates
+void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
+  delete usage;
+}
+
+}  // end extern "C"
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
new file mode 100644
index 000000000..cf2e266f9
--- /dev/null
+++ b/src/rocksdb/db/c_test.c
@@ -0,0 +1,1866 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors. */
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE  // Lite does not support C API
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <inttypes.h>
+
+// Can not use port/port.h macros as this is a c file
+#ifdef OS_WIN
+#include <windows.h>
+
+// Ok for uniqueness
+int geteuid() {
+  int result = 0;
+
+  result = ((int)GetCurrentProcessId() << 16);
+  result |= (int)GetCurrentThreadId();
+
+  return result;
+}
+
+// VS < 2015
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#define snprintf _snprintf
+#endif
+
+#endif
+
+const char* phase = "";
+static char dbname[200];
+static char sstfilename[200];
+static char dbbackupname[200];
+static char dbcheckpointname[200];
+static char dbpathname[200];
+static char secondary_path[200];
+
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning (disable: 4996)  // getenv security warning
+#endif
+static const char* GetTempDir(void) {
+  const char* ret = getenv("TEST_TMPDIR");
+  if (ret == NULL || ret[0] == '\0')
+    ret = "/tmp";
+  return ret;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define CheckNoError(err)                                                 \
+  if ((err) != NULL) {                                                    \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                              \
+  }
+
+#define CheckCondition(cond)                                              \
+  if (!(cond)) {                                                          \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                              \
+  }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    fprintf(stderr, "%s: expected '%s', got '%s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? v : "(null)"));
v : "(null")); + abort(); + } +} + +static void Free(char** ptr) { + if (*ptr) { + free(*ptr); + *ptr = NULL; + } +} + +static void CheckValue( + char* err, + const char* expected, + char** actual, + size_t actual_length) { + CheckNoError(err); + CheckEqual(expected, *actual, actual_length); + Free(actual); +} + +static void CheckGet( + rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_get(db, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckGetCF( + rocksdb_t* db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* handle, + const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckPinGet(rocksdb_t* db, const rocksdb_readoptions_t* options, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + const char* val; + rocksdb_pinnableslice_t* p; + p = rocksdb_get_pinned(db, options, key, strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void CheckPinGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* handle, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + const char* val; + rocksdb_pinnableslice_t* p; + p = rocksdb_get_pinned_cf(db, options, handle, key, strlen(key), &err); + CheckNoError(err); + val = rocksdb_pinnableslice_value(p, &val_len); + CheckEqual(expected, val, val_len); + rocksdb_pinnableslice_destroy(p); +} + +static void CheckIter(rocksdb_iterator_t* iter, + const char* key, const char* val) { + size_t len; + const char* str; + str = rocksdb_iter_key(iter, &len); + CheckEqual(key, str, len); + str = rocksdb_iter_value(iter, &len); + CheckEqual(val, str, len); +} + +// Callback from rocksdb_writebatch_iterate() +static void CheckPut(void* ptr, + const char* k, size_t klen, + const char* v, size_t vlen) { + int* state = (int*) ptr; + CheckCondition(*state < 2); + switch (*state) { + case 0: + CheckEqual("bar", k, klen); + CheckEqual("b", v, vlen); + break; + case 1: + CheckEqual("box", k, klen); + CheckEqual("c", v, vlen); + break; + } + (*state)++; +} + +// Callback from rocksdb_writebatch_iterate() +static void CheckDel(void* ptr, const char* k, size_t klen) { + int* state = (int*) ptr; + CheckCondition(*state == 2); + CheckEqual("bar", k, klen); + (*state)++; +} + +static void CmpDestroy(void* arg) { (void)arg; } + +static int CmpCompare(void* arg, const char* a, size_t alen, + const char* b, size_t blen) { + (void)arg; + size_t n = (alen < blen) ? 
alen : blen; + int r = memcmp(a, b, n); + if (r == 0) { + if (alen < blen) r = -1; + else if (alen > blen) r = +1; + } + return r; +} + +static const char* CmpName(void* arg) { + (void)arg; + return "foo"; +} + +// Custom filter policy +static unsigned char fake_filter_result = 1; +static void FilterDestroy(void* arg) { (void)arg; } +static const char* FilterName(void* arg) { + (void)arg; + return "TestFilter"; +} +static char* FilterCreate( + void* arg, + const char* const* key_array, const size_t* key_length_array, + int num_keys, + size_t* filter_length) { + (void)arg; + (void)key_array; + (void)key_length_array; + (void)num_keys; + *filter_length = 4; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +static unsigned char FilterKeyMatch( + void* arg, + const char* key, size_t length, + const char* filter, size_t filter_length) { + (void)arg; + (void)key; + (void)length; + CheckCondition(filter_length == 4); + CheckCondition(memcmp(filter, "fake", 4) == 0); + return fake_filter_result; +} + +// Custom compaction filter +static void CFilterDestroy(void* arg) { (void)arg; } +static const char* CFilterName(void* arg) { + (void)arg; + return "foo"; +} +static unsigned char CFilterFilter(void* arg, int level, const char* key, + size_t key_length, + const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed) { + (void)arg; + (void)level; + (void)existing_value; + (void)value_length; + if (key_length == 3) { + if (memcmp(key, "bar", key_length) == 0) { + return 1; + } else if (memcmp(key, "baz", key_length) == 0) { + *value_changed = 1; + *new_value = "newbazvalue"; + *new_value_length = 11; + return 0; + } + } + return 0; +} + +static void CFilterFactoryDestroy(void* arg) { (void)arg; } +static const char* CFilterFactoryName(void* arg) { + (void)arg; + return "foo"; +} +static rocksdb_compactionfilter_t* CFilterCreate( + void* arg, rocksdb_compactionfiltercontext_t* context) { + (void)arg; + (void)context; + return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter, + CFilterName); +} + +static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options, + rocksdb_readoptions_t* roptions, + rocksdb_writeoptions_t* woptions) { + char* err = NULL; + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "foovalue"); + rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "bar", "barvalue"); + rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "baz", "bazvalue"); + + // Force compaction + rocksdb_compact_range(db, NULL, 0, NULL, 0); + // should have filtered bar, but not foo + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "baz", "newbazvalue"); + return db; +} + +// Custom merge operator +static void MergeOperatorDestroy(void* arg) { (void)arg; } +static const char* MergeOperatorName(void* arg) { + (void)arg; + return "TestMergeOperator"; +} +static char* MergeOperatorFullMerge( + void* arg, + const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length) { + (void)arg; + (void)key; + (void)key_length; + 
(void)existing_value; + (void)existing_value_length; + (void)operands_list; + (void)operands_list_length; + (void)num_operands; + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} +static char* MergeOperatorPartialMerge( + void* arg, + const char* key, size_t key_length, + const char* const* operands_list, const size_t* operands_list_length, + int num_operands, + unsigned char* success, size_t* new_value_length) { + (void)arg; + (void)key; + (void)key_length; + (void)operands_list; + (void)operands_list_length; + (void)num_operands; + *new_value_length = 4; + *success = 1; + char* result = malloc(4); + memcpy(result, "fake", 4); + return result; +} + +static void CheckTxnGet( + rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckTxnGetCF(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get_cf(txn, options, column_family, key, + strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckTxnDBGet( + rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transactiondb_get(txn_db, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key, + strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + +int main(int argc, char** argv) { + (void)argc; + (void)argv; + rocksdb_t* db; + rocksdb_comparator_t* cmp; + rocksdb_cache_t* cache; + rocksdb_dbpath_t *dbpath; + rocksdb_env_t* env; + rocksdb_options_t* options; + rocksdb_compactoptions_t* coptions; + rocksdb_block_based_table_options_t* table_options; + rocksdb_readoptions_t* roptions; + rocksdb_writeoptions_t* woptions; + rocksdb_ratelimiter_t* rate_limiter; + rocksdb_transactiondb_t* txn_db; + rocksdb_transactiondb_options_t* txn_db_options; + rocksdb_transaction_t* txn; + rocksdb_transaction_options_t* txn_options; + rocksdb_optimistictransactiondb_t* otxn_db; + rocksdb_optimistictransaction_options_t* otxn_options; + char* err = NULL; + int run = -1; + + snprintf(dbname, sizeof(dbname), + "%s/rocksdb_c_test-%d", + GetTempDir(), + ((int) geteuid())); + + snprintf(dbbackupname, sizeof(dbbackupname), + "%s/rocksdb_c_test-%d-backup", + GetTempDir(), + ((int) geteuid())); + + snprintf(dbcheckpointname, sizeof(dbcheckpointname), + "%s/rocksdb_c_test-%d-checkpoint", + GetTempDir(), + ((int) geteuid())); + + snprintf(sstfilename, sizeof(sstfilename), + "%s/rocksdb_c_test-%d-sst", + GetTempDir(), + ((int)geteuid())); + + snprintf(dbpathname, sizeof(dbpathname), + "%s/rocksdb_c_test-%d-dbpath", + 
GetTempDir(), + ((int) geteuid())); + + StartPhase("create_objects"); + cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024); + env = rocksdb_create_default_env(); + cache = rocksdb_cache_create_lru(100000); + + options = rocksdb_options_create(); + rocksdb_options_set_comparator(options, cmp); + rocksdb_options_set_error_if_exists(options, 1); + rocksdb_options_set_env(options, env); + rocksdb_options_set_info_log(options, NULL); + rocksdb_options_set_write_buffer_size(options, 100000); + rocksdb_options_set_paranoid_checks(options, 1); + rocksdb_options_set_max_open_files(options, 10); + rocksdb_options_set_base_background_compactions(options, 1); + + table_options = rocksdb_block_based_options_create(); + rocksdb_block_based_options_set_block_cache(table_options, cache); + rocksdb_block_based_options_set_data_block_index_type(table_options, 1); + rocksdb_block_based_options_set_data_block_hash_ratio(table_options, 0.75); + rocksdb_options_set_block_based_table_factory(options, table_options); + + rocksdb_options_set_compression(options, rocksdb_no_compression); + rocksdb_options_set_compression_options(options, -14, -1, 0, 0); + int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, + rocksdb_no_compression, rocksdb_no_compression}; + rocksdb_options_set_compression_per_level(options, compression_levels, 4); + rate_limiter = rocksdb_ratelimiter_create(1000 * 1024 * 1024, 100 * 1000, 10); + rocksdb_options_set_ratelimiter(options, rate_limiter); + rocksdb_ratelimiter_destroy(rate_limiter); + + roptions = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(roptions, 1); + rocksdb_readoptions_set_fill_cache(roptions, 1); + + woptions = rocksdb_writeoptions_create(); + rocksdb_writeoptions_set_sync(woptions, 1); + + coptions = rocksdb_compactoptions_create(); + rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1); + + StartPhase("destroy"); + rocksdb_destroy_db(options, dbname, &err); + Free(&err); + + StartPhase("open_error"); + rocksdb_open(options, dbname, &err); + CheckCondition(err != NULL); + Free(&err); + + StartPhase("open"); + rocksdb_options_set_create_if_missing(options, 1); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + + StartPhase("put"); + rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("backup_and_restore"); + { + rocksdb_destroy_db(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_create_new_backup(be, db, &err); + CheckNoError(err); + + // need a change to trigger a new backup + rocksdb_delete(db, woptions, "does-not-exist", 14, &err); + CheckNoError(err); + + rocksdb_backup_engine_create_new_backup(be, db, &err); + CheckNoError(err); + + const rocksdb_backup_engine_info_t* bei = rocksdb_backup_engine_get_backup_info(be); + CheckCondition(rocksdb_backup_engine_info_count(bei) > 1); + rocksdb_backup_engine_info_destroy(bei); + + rocksdb_backup_engine_purge_old_backups(be, 1, &err); + CheckNoError(err); + + bei = rocksdb_backup_engine_get_backup_info(be); + CheckCondition(rocksdb_backup_engine_info_count(bei) == 1); + rocksdb_backup_engine_info_destroy(bei); + + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + + rocksdb_close(db); + + 
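+    // The steps below exercise the usual disaster-recovery flow of the
+    // backup engine C API: destroy the primary DB directory, then rebuild
+    // it from the most recent backup. A condensed sketch of just the
+    // restore half (same calls as used below, error handling elided):
+    //
+    //   rocksdb_restore_options_t* ro = rocksdb_restore_options_create();
+    //   rocksdb_backup_engine_restore_db_from_latest_backup(
+    //       be, dbname, dbname, ro, &err);
+    //   rocksdb_restore_options_destroy(ro);
+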
rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create(); + rocksdb_restore_options_set_keep_log_files(restore_options, 0); + rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err); + CheckNoError(err); + rocksdb_restore_options_destroy(restore_options); + + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_options_set_error_if_exists(options, 1); + + CheckGet(db, roptions, "foo", "hello"); + + rocksdb_backup_engine_close(be); + } + + StartPhase("checkpoint"); + { + rocksdb_destroy_db(options, dbcheckpointname, &err); + CheckNoError(err); + + rocksdb_checkpoint_t* checkpoint = rocksdb_checkpoint_object_create(db, &err); + CheckNoError(err); + + rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err); + CheckNoError(err); + + // start a new database from the checkpoint + rocksdb_close(db); + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbcheckpointname, &err); + CheckNoError(err); + + CheckGet(db, roptions, "foo", "hello"); + + rocksdb_checkpoint_object_destroy(checkpoint); + + rocksdb_close(db); + rocksdb_destroy_db(options, dbcheckpointname, &err); + CheckNoError(err); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_options_set_error_if_exists(options, 1); + } + + StartPhase("compactall"); + rocksdb_compact_range(db, NULL, 0, NULL, 0); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactrange"); + rocksdb_compact_range(db, "a", 1, "z", 1); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactallopt"); + rocksdb_compact_range_opt(db, coptions, NULL, 0, NULL, 0); + CheckGet(db, roptions, "foo", "hello"); + + StartPhase("compactrangeopt"); + rocksdb_compact_range_opt(db, coptions, "a", 1, "z", 1); + CheckGet(db, roptions, "foo", "hello"); + + // Simple check cache usage + StartPhase("cache_usage"); + { + rocksdb_readoptions_set_pin_data(roptions, 1); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + rocksdb_iter_seek(iter, "foo", 3); + + size_t usage = rocksdb_cache_get_usage(cache); + CheckCondition(usage > 0); + + size_t pin_usage = rocksdb_cache_get_pinned_usage(cache); + CheckCondition(pin_usage > 0); + + rocksdb_iter_next(iter); + rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_pin_data(roptions, 0); + } + + StartPhase("addfile"); + { + rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create(); + rocksdb_options_t* io_options = rocksdb_options_create(); + rocksdb_sstfilewriter_t* writer = + rocksdb_sstfilewriter_create(env_opt, io_options); + + remove(sstfilename); + rocksdb_sstfilewriter_open(writer, sstfilename, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v2", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v3", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_finish(writer, &err); + CheckNoError(err); + + rocksdb_ingestexternalfileoptions_t* ing_opt = + rocksdb_ingestexternalfileoptions_create(); + const char* file_list[1] = {sstfilename}; + rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err); + CheckNoError(err); + CheckGet(db, roptions, "sstk1", "v1"); + CheckGet(db, roptions, "sstk2", "v2"); + CheckGet(db, roptions, "sstk3", "v3"); + + remove(sstfilename); + rocksdb_sstfilewriter_open(writer, 
sstfilename, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk22", 6, "v5", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v6", 2, &err); + CheckNoError(err); + rocksdb_sstfilewriter_finish(writer, &err); + CheckNoError(err); + + rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err); + CheckNoError(err); + CheckGet(db, roptions, "sstk1", "v1"); + CheckGet(db, roptions, "sstk2", "v4"); + CheckGet(db, roptions, "sstk22", "v5"); + CheckGet(db, roptions, "sstk3", "v6"); + + rocksdb_ingestexternalfileoptions_destroy(ing_opt); + rocksdb_sstfilewriter_destroy(writer); + rocksdb_options_destroy(io_options); + rocksdb_envoptions_destroy(env_opt); + + // Delete all keys we just ingested + rocksdb_delete(db, woptions, "sstk1", 5, &err); + CheckNoError(err); + rocksdb_delete(db, woptions, "sstk2", 5, &err); + CheckNoError(err); + rocksdb_delete(db, woptions, "sstk22", 6, &err); + CheckNoError(err); + rocksdb_delete(db, woptions, "sstk3", 5, &err); + CheckNoError(err); + } + + StartPhase("writebatch"); + { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + int pos = 0; + rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); + CheckCondition(pos == 3); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "bay", 3, "d", 1); + rocksdb_writebatch_delete_range(wb, "bar", 3, "bay", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "bay", "d"); + rocksdb_writebatch_clear(wb); + const char* start_list[1] = {"bay"}; + const size_t start_sizes[1] = {3}; + const char* end_list[1] = {"baz"}; + const size_t end_sizes[1] = {3}; + rocksdb_writebatch_delete_rangev(wb, 1, start_list, start_sizes, end_list, + end_sizes); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "bay", NULL); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("writebatch_vectors"); + { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + const char* k_list[2] = { "z", "ap" }; + const size_t k_sizes[2] = { 1, 2 }; + const char* v_list[3] = { "x", "y", "z" }; + const size_t v_sizes[3] = { 1, 1, 1 }; + rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", "xyz"); + rocksdb_writebatch_delete(wb, "zap", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("writebatch_savepoint"); + { + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_set_save_point(wb); + rocksdb_writebatch_set_save_point(wb); + const char* k_list[2] = {"z", "ap"}; + const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; + rocksdb_writebatch_pop_save_point(wb, &err); + CheckNoError(err); + rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + 
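+    // Rolling back to the save point set above discards the putv() that
+    // precedes this line, so the batch written below is effectively empty
+    // and "zap" must remain unset.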
rocksdb_writebatch_rollback_to_save_point(wb, &err); + CheckNoError(err); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_destroy(wb); + } + + StartPhase("writebatch_rep"); + { + rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb1, "baz", 3, "d", 1); + rocksdb_writebatch_put(wb1, "quux", 4, "e", 1); + rocksdb_writebatch_delete(wb1, "quux", 4); + size_t repsize1 = 0; + const char* rep = rocksdb_writebatch_data(wb1, &repsize1); + rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1); + CheckCondition(rocksdb_writebatch_count(wb1) == + rocksdb_writebatch_count(wb2)); + size_t repsize2 = 0; + CheckCondition( + memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0); + rocksdb_writebatch_destroy(wb1); + rocksdb_writebatch_destroy(wb2); + } + + StartPhase("writebatch_wi"); + { + rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1); + rocksdb_writebatch_wi_put(wbi, "foo", 3, "a", 1); + rocksdb_writebatch_wi_clear(wbi); + rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1); + rocksdb_writebatch_wi_put(wbi, "box", 3, "c", 1); + rocksdb_writebatch_wi_delete(wbi, "bar", 3); + int count = rocksdb_writebatch_wi_count(wbi); + CheckCondition(count == 3); + size_t size; + char* value; + value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size, &err); + CheckValue(err, "c", &value, size); + value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size, &err); + CheckValue(err, NULL, &value, size); + value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "foo", 3, &size, &err); + CheckValue(err, "hello", &value, size); + value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "box", 3, &size, &err); + CheckValue(err, "c", &value, size); + rocksdb_write_writebatch_wi(db, woptions, wbi, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "hello"); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + int pos = 0; + rocksdb_writebatch_wi_iterate(wbi, &pos, CheckPut, CheckDel); + CheckCondition(pos == 3); + rocksdb_writebatch_wi_clear(wbi); + rocksdb_writebatch_wi_destroy(wbi); + } + + StartPhase("writebatch_wi_vectors"); + { + rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1); + const char* k_list[2] = { "z", "ap" }; + const size_t k_sizes[2] = { 1, 2 }; + const char* v_list[3] = { "x", "y", "z" }; + const size_t v_sizes[3] = { 1, 1, 1 }; + rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + rocksdb_write_writebatch_wi(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", "xyz"); + rocksdb_writebatch_wi_delete(wb, "zap", 3); + rocksdb_write_writebatch_wi(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_wi_destroy(wb); + } + + StartPhase("writebatch_wi_savepoint"); + { + rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1); + rocksdb_writebatch_wi_set_save_point(wb); + const char* k_list[2] = {"z", "ap"}; + const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; + rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); + rocksdb_writebatch_wi_rollback_to_save_point(wb, &err); + CheckNoError(err); + rocksdb_write_writebatch_wi(db, woptions, wb, &err); + CheckNoError(err); + CheckGet(db, roptions, "zap", NULL); + rocksdb_writebatch_wi_destroy(wb); + } + + 
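+  // The writebatch_wi phases above lean on WriteBatchWithIndex's
+  // read-your-own-writes behaviour: updates become visible through the
+  // batch before anything is committed to the DB. A condensed sketch of
+  // that pattern, using only calls exercised above (error handling elided):
+  //
+  //   rocksdb_writebatch_wi_t* b = rocksdb_writebatch_wi_create(0, 1);
+  //   rocksdb_writebatch_wi_put(b, "k", 1, "v", 1);
+  //   // served from the batch index, not the DB:
+  //   val = rocksdb_writebatch_wi_get_from_batch(b, options, "k", 1,
+  //                                              &size, &err);
+  //   // batch entries overlaid on the DB contents:
+  //   val = rocksdb_writebatch_wi_get_from_batch_and_db(b, db, roptions,
+  //                                                     "k", 1, &size, &err);
+  //   rocksdb_write_writebatch_wi(db, woptions, b, &err);  // commit
+  //   rocksdb_writebatch_wi_destroy(b);
+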
StartPhase("iter"); + { + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "box", "c"); + rocksdb_iter_next(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_prev(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_seek(iter, "b", 1); + CheckIter(iter, "box", "c"); + rocksdb_iter_seek_for_prev(iter, "g", 1); + CheckIter(iter, "foo", "hello"); + rocksdb_iter_seek_for_prev(iter, "box", 3); + CheckIter(iter, "box", "c"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + } + + StartPhase("wbwi_iter"); + { + rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions); + rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1); + rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1); + rocksdb_writebatch_wi_delete(wbi, "foo", 3); + rocksdb_iterator_t* iter = + rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "bar", "b"); + rocksdb_iter_next(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_prev(iter); + CheckIter(iter, "bar", "b"); + rocksdb_iter_prev(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_last(iter); + CheckIter(iter, "box", "c"); + rocksdb_iter_seek(iter, "b", 1); + CheckIter(iter, "bar", "b"); + rocksdb_iter_seek_for_prev(iter, "c", 1); + CheckIter(iter, "box", "c"); + rocksdb_iter_seek_for_prev(iter, "box", 3); + CheckIter(iter, "box", "c"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + rocksdb_writebatch_wi_destroy(wbi); + } + + StartPhase("multiget"); + { + const char* keys[3] = { "box", "foo", "notfound" }; + const size_t keys_sizes[3] = { 3, 3, 8 }; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes, errs); + + int i; + for (i = 0; i < 3; i++) { + CheckEqual(NULL, errs[i], 0); + switch (i) { + case 0: + CheckEqual("c", vals[i], vals_sizes[i]); + break; + case 1: + CheckEqual("hello", vals[i], vals_sizes[i]); + break; + case 2: + CheckEqual(NULL, vals[i], vals_sizes[i]); + break; + } + Free(&vals[i]); + } + } + + StartPhase("pin_get"); + { + CheckPinGet(db, roptions, "box", "c"); + CheckPinGet(db, roptions, "foo", "hello"); + CheckPinGet(db, roptions, "notfound", NULL); + } + + StartPhase("approximate_sizes"); + { + int i; + int n = 20000; + char keybuf[100]; + char valbuf[100]; + uint64_t sizes[2]; + const char* start[2] = { "a", "k00000000000000010000" }; + size_t start_len[2] = { 1, 21 }; + const char* limit[2] = { "k00000000000000010000", "z" }; + size_t limit_len[2] = { 21, 1 }; + rocksdb_writeoptions_set_sync(woptions, 0); + for (i = 0; i < n; i++) { + snprintf(keybuf, sizeof(keybuf), "k%020d", i); + snprintf(valbuf, sizeof(valbuf), "v%020d", i); + rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), + &err); + CheckNoError(err); + } + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + CheckCondition(sizes[0] > 0); + CheckCondition(sizes[1] > 0); + } + + StartPhase("property"); + { + char* prop = rocksdb_property_value(db, "nosuchprop"); + CheckCondition(prop 
== NULL); + prop = rocksdb_property_value(db, "rocksdb.stats"); + CheckCondition(prop != NULL); + Free(&prop); + } + + StartPhase("snapshot"); + { + const rocksdb_snapshot_t* snap; + snap = rocksdb_create_snapshot(db); + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", "hello"); + rocksdb_readoptions_set_snapshot(roptions, NULL); + CheckGet(db, roptions, "foo", NULL); + rocksdb_release_snapshot(db, snap); + } + + StartPhase("repair"); + { + // If we do not compact here, then the lazy deletion of + // files (https://reviews.facebook.net/D6123) would leave + // around deleted files and the repair process will find + // those files and put them back into the database. + rocksdb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_close(db); + rocksdb_options_set_create_if_missing(options, 0); + rocksdb_options_set_error_if_exists(options, 0); + rocksdb_options_set_wal_recovery_mode(options, 2); + rocksdb_repair_db(options, dbname, &err); + CheckNoError(err); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", NULL); + CheckGet(db, roptions, "bar", NULL); + CheckGet(db, roptions, "box", "c"); + rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_set_error_if_exists(options, 1); + } + + StartPhase("filter"); + for (run = 0; run <= 2; run++) { + // First run uses custom filter + // Second run uses old block-based bloom filter + // Third run uses full bloom filter + CheckNoError(err); + rocksdb_filterpolicy_t* policy; + if (run == 0) { + policy = rocksdb_filterpolicy_create(NULL, FilterDestroy, FilterCreate, + FilterKeyMatch, NULL, FilterName); + } else if (run == 1) { + policy = rocksdb_filterpolicy_create_bloom(8); + } else { + policy = rocksdb_filterpolicy_create_bloom_full(8); + } + rocksdb_block_based_options_set_filter_policy(table_options, policy); + + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_block_based_table_factory(options, table_options); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + + { + // Add enough keys to get just one reasonably populated Bloom filter + const int keys_to_add = 1500; + int i; + char keybuf[100]; + for (i = 0; i < keys_to_add; i++) { + snprintf(keybuf, sizeof(keybuf), "yes%020d", i); + rocksdb_put(db, woptions, keybuf, strlen(keybuf), "val", 3, &err); + CheckNoError(err); + } + } + rocksdb_compact_range(db, NULL, 0, NULL, 0); + + fake_filter_result = 1; + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + if (run == 0) { + // Must not find value when custom filter returns false + fake_filter_result = 0; + CheckGet(db, roptions, "foo", NULL); + CheckGet(db, roptions, "bar", NULL); + fake_filter_result = 1; + + CheckGet(db, roptions, "foo", "foovalue"); + CheckGet(db, roptions, "bar", "barvalue"); + } + + { + // Query some keys not added to identify Bloom filter implementation + // from false positive queries, using perfcontext to detect Bloom + // filter behavior + rocksdb_perfcontext_t* perf = rocksdb_perfcontext_create(); + rocksdb_perfcontext_reset(perf); + + const int keys_to_query = 10000; + int i; + char keybuf[100]; + for (i = 0; i < keys_to_query; i++) { + fake_filter_result = i % 2; + snprintf(keybuf, 
sizeof(keybuf), "no%020d", i); + CheckGet(db, roptions, keybuf, NULL); + } + + const int hits = + (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_hit_count); + if (run == 0) { + // Due to half true, half false with fake filter result + CheckCondition(hits == keys_to_query / 2); + } else if (run == 1) { + // Essentially a fingerprint of the block-based Bloom schema + CheckCondition(hits == 241); + } else { + // Essentially a fingerprint of the full Bloom schema(s), + // format_version < 5, which vary for three different CACHE_LINE_SIZEs + CheckCondition(hits == 224 || hits == 180 || hits == 125); + } + CheckCondition( + (keys_to_query - hits) == + (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_miss_count)); + + rocksdb_perfcontext_destroy(perf); + } + + // Reset the policy + rocksdb_block_based_options_set_filter_policy(table_options, NULL); + rocksdb_options_set_block_based_table_factory(options, table_options); + } + + StartPhase("compaction_filter"); + { + rocksdb_options_t* options_with_filter = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(options_with_filter, 1); + rocksdb_compactionfilter_t* cfilter; + cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy, + CFilterFilter, CFilterName); + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options_with_filter, dbname, &err); + rocksdb_options_set_compaction_filter(options_with_filter, cfilter); + db = CheckCompaction(db, options_with_filter, roptions, woptions); + + rocksdb_options_set_compaction_filter(options_with_filter, NULL); + rocksdb_compactionfilter_destroy(cfilter); + rocksdb_options_destroy(options_with_filter); + } + + StartPhase("compaction_filter_factory"); + { + rocksdb_options_t* options_with_filter_factory = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(options_with_filter_factory, 1); + rocksdb_compactionfilterfactory_t* factory; + factory = rocksdb_compactionfilterfactory_create( + NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName); + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options_with_filter_factory, dbname, &err); + rocksdb_options_set_compaction_filter_factory(options_with_filter_factory, + factory); + db = CheckCompaction(db, options_with_filter_factory, roptions, woptions); + + rocksdb_options_set_compaction_filter_factory( + options_with_filter_factory, NULL); + rocksdb_options_destroy(options_with_filter_factory); + } + + StartPhase("merge_operator"); + { + rocksdb_mergeoperator_t* merge_operator; + merge_operator = rocksdb_mergeoperator_create( + NULL, MergeOperatorDestroy, MergeOperatorFullMerge, + MergeOperatorPartialMerge, NULL, MergeOperatorName); + // Create new database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_set_merge_operator(options, merge_operator); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "foovalue"); + rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "foo", "fake"); + + // Merge of a non-existing value + rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err); + CheckNoError(err); + CheckGet(db, roptions, "bar", "fake"); + + } + + StartPhase("columnfamilies"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_t* db_options = rocksdb_options_create(); + 
rocksdb_options_set_create_if_missing(db_options, 1); + db = rocksdb_open(db_options, dbname, &err); + CheckNoError(err) + rocksdb_column_family_handle_t* cfh; + cfh = rocksdb_create_column_family(db, db_options, "cf1", &err); + rocksdb_column_family_handle_destroy(cfh); + CheckNoError(err); + rocksdb_close(db); + + size_t cflen; + char** column_fams = rocksdb_list_column_families(db_options, dbname, &cflen, &err); + CheckNoError(err); + CheckEqual("default", column_fams[0], 7); + CheckEqual("cf1", column_fams[1], 3); + CheckCondition(cflen == 2); + rocksdb_list_column_families_destroy(column_fams, cflen); + + rocksdb_options_t* cf_options = rocksdb_options_create(); + + const char* cf_names[2] = {"default", "cf1"}; + const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options}; + rocksdb_column_family_handle_t* handles[2]; + db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts, handles, &err); + CheckNoError(err); + + rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err); + CheckNoError(err); + + rocksdb_put_cf(db, woptions, handles[1], "foobar1", 7, "hello1", 6, &err); + CheckNoError(err); + rocksdb_put_cf(db, woptions, handles[1], "foobar2", 7, "hello2", 6, &err); + CheckNoError(err); + rocksdb_put_cf(db, woptions, handles[1], "foobar3", 7, "hello3", 6, &err); + CheckNoError(err); + rocksdb_put_cf(db, woptions, handles[1], "foobar4", 7, "hello4", 6, &err); + CheckNoError(err); + + rocksdb_flushoptions_t *flush_options = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_options, 1); + rocksdb_flush_cf(db, flush_options, handles[1], &err); + CheckNoError(err) + rocksdb_flushoptions_destroy(flush_options); + + CheckGetCF(db, roptions, handles[1], "foo", "hello"); + CheckPinGetCF(db, roptions, handles[1], "foo", "hello"); + + rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err); + CheckNoError(err); + + rocksdb_delete_range_cf(db, woptions, handles[1], "foobar2", 7, "foobar4", + 7, &err); + CheckNoError(err); + + CheckGetCF(db, roptions, handles[1], "foo", NULL); + CheckPinGetCF(db, roptions, handles[1], "foo", NULL); + + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1); + rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1); + rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3); + rocksdb_write(db, woptions, wb, &err); + CheckNoError(err); + CheckGetCF(db, roptions, handles[1], "baz", NULL); + CheckGetCF(db, roptions, handles[1], "bar", NULL); + CheckGetCF(db, roptions, handles[1], "box", "c"); + CheckPinGetCF(db, roptions, handles[1], "baz", NULL); + CheckPinGetCF(db, roptions, handles[1], "bar", NULL); + CheckPinGetCF(db, roptions, handles[1], "box", "c"); + rocksdb_writebatch_destroy(wb); + + const char* keys[3] = { "box", "box", "barfooxx" }; + const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] }; + const size_t keys_sizes[3] = { 3, 3, 8 }; + char* vals[3]; + size_t vals_sizes[3]; + char* errs[3]; + rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals, vals_sizes, errs); + + int i; + for (i = 0; i < 3; i++) { + CheckEqual(NULL, errs[i], 0); + switch (i) { + case 0: + CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf + break; + case 1: + CheckEqual("c", vals[i], vals_sizes[i]); // bingo + break; + case 2: + CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found + break; + } + 
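+      // Values returned by rocksdb_multi_get_cf() are malloc'd per key;
+      // Free() tolerates the NULL (not-found) entries, so releasing every
+      // slot keeps the loop leak-free.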
Free(&vals[i]); + } + + rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + + for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) { + i++; + } + CheckCondition(i == 3); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + rocksdb_column_family_handle_t* iters_cf_handles[2] = { handles[0], handles[1] }; + rocksdb_iterator_t* iters_handles[2]; + rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2, &err); + CheckNoError(err); + + iter = iters_handles[0]; + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_destroy(iter); + + iter = iters_handles[1]; + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + + for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) { + i++; + } + CheckCondition(i == 3); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + rocksdb_drop_column_family(db, handles[1], &err); + CheckNoError(err); + for (i = 0; i < 2; i++) { + rocksdb_column_family_handle_destroy(handles[i]); + } + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + rocksdb_options_destroy(db_options); + rocksdb_options_destroy(cf_options); + } + + StartPhase("prefix"); + { + // Create new database + rocksdb_options_set_allow_mmap_reads(options, 1); + rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); + rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4); + rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); + rocksdb_options_set_allow_concurrent_memtable_write(options, 0); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err); + CheckNoError(err); + + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_seek(iter, "bar", 3); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + CheckCondition(rocksdb_iter_valid(iter)); + + CheckIter(iter, "bar1", "bar"); + rocksdb_iter_next(iter); + CheckIter(iter, "bar2", "bar"); + rocksdb_iter_next(iter); + CheckIter(iter, "bar3", "bar"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + rocksdb_readoptions_set_total_order_seek(roptions, 1); + iter = rocksdb_create_iterator(db, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_seek(iter, "ba", 2); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "bar1", "bar"); + + rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_total_order_seek(roptions, 0); + + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + } + + // Check memory usage stats + StartPhase("approximate_memory_usage"); + { + // Create database + db = 
rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_memory_consumers_t* consumers; + consumers = rocksdb_memory_consumers_create(); + rocksdb_memory_consumers_add_db(consumers, db); + rocksdb_memory_consumers_add_cache(consumers, cache); + + // take memory usage report before write-read operation + rocksdb_memory_usage_t* mu1; + mu1 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // Put data (this should affect memtables) + rocksdb_put(db, woptions, "memory", 6, "test", 4, &err); + CheckNoError(err); + CheckGet(db, roptions, "memory", "test"); + + // take memory usage report after write-read operation + rocksdb_memory_usage_t* mu2; + mu2 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // amount of memory used within memtables should grow + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_total(mu1)); + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1)); + + rocksdb_memory_consumers_destroy(consumers); + rocksdb_approximate_memory_usage_destroy(mu1); + rocksdb_approximate_memory_usage_destroy(mu2); + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + } + + StartPhase("cuckoo_options"); + { + rocksdb_cuckoo_table_options_t* cuckoo_options; + cuckoo_options = rocksdb_cuckoo_options_create(); + rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5); + rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200); + rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10); + rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1); + rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0); + rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_cuckoo_options_destroy(cuckoo_options); + } + + StartPhase("iterate_upper_bound"); + { + // Create new empty database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_prefix_extractor(options, NULL); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_put(db, woptions, "a", 1, "0", 1, &err); CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); CheckNoError(err); + rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err); + rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); CheckNoError(err); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "g1", "0"); + + rocksdb_iter_destroy(iter); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + // iterate_upper_bound points beyond the last expected entry + rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4); + + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + 
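+      // With iterate_upper_bound set to "foo2", forward iteration may
+      // yield "foo" and "foo1" but must become invalid before reaching
+      // "g1": the bound is exclusive and caps the iterator.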
CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + // should stop here... + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + } + } + + StartPhase("transactions"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + // open a TransactionDB + txn_db_options = rocksdb_transactiondb_options_create(); + txn_options = rocksdb_transaction_options_create(); + rocksdb_options_set_create_if_missing(options, 1); + txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err); + CheckNoError(err); + + // put outside a transaction + rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "foo", "hello"); + + // delete from outside transaction + rocksdb_transactiondb_delete(txn_db, woptions, "foo", 3, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "foo", NULL); + + // write batch into TransactionDB + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_transactiondb_write(txn_db, woptions, wb, &err); + rocksdb_writebatch_destroy(wb); + CheckTxnDBGet(txn_db, roptions, "box", "c"); + CheckNoError(err); + + // begin a transaction + txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); + // put + rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err); + CheckNoError(err); + CheckTxnGet(txn, roptions, "foo", "hello"); + // delete + rocksdb_transaction_delete(txn, "foo", 3, &err); + CheckNoError(err); + CheckTxnGet(txn, roptions, "foo", NULL); + + rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err); + CheckNoError(err); + + // read from outside transaction, before commit + CheckTxnDBGet(txn_db, roptions, "foo", NULL); + + // commit + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + + // read from outside transaction, after commit + CheckTxnDBGet(txn_db, roptions, "foo", "hello"); + + // reuse old transaction + txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, txn); + + // snapshot + const rocksdb_snapshot_t* snapshot; + snapshot = rocksdb_transactiondb_create_snapshot(txn_db); + rocksdb_readoptions_set_snapshot(roptions, snapshot); + + rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err); + CheckNoError(err); + + CheckTxnDBGet(txn_db, roptions, "foo", "hello"); + rocksdb_readoptions_set_snapshot(roptions, NULL); + rocksdb_transactiondb_release_snapshot(txn_db, snapshot); + CheckTxnDBGet(txn_db, roptions, "foo", "hey"); + + // iterate + rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err); + rocksdb_iterator_t* iter = rocksdb_transaction_create_iterator(txn, roptions); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "bar", "hi"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + // rollback + rocksdb_transaction_rollback(txn, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "bar", NULL); + + // save point + rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err); + 
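+    // A savepoint scopes partial rollback inside the transaction:
+    // rocksdb_transaction_rollback_to_savepoint() undoes only the writes
+    // made after rocksdb_transaction_set_savepoint(), so "foo1" survives
+    // below while "foo2" is discarded.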
rocksdb_transaction_set_savepoint(txn); + CheckTxnGet(txn, roptions, "foo1", "hi1"); + rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err); + CheckTxnGet(txn, roptions, "foo2", "hi2"); + + // rollback to savepoint + rocksdb_transaction_rollback_to_savepoint(txn, &err); + CheckNoError(err); + CheckTxnGet(txn, roptions, "foo2", NULL); + CheckTxnGet(txn, roptions, "foo1", "hi1"); + CheckTxnDBGet(txn_db, roptions, "foo1", NULL); + CheckTxnDBGet(txn_db, roptions, "foo2", NULL); + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "foo1", "hi1"); + CheckTxnDBGet(txn_db, roptions, "foo2", NULL); + + // Column families. + rocksdb_column_family_handle_t* cfh; + cfh = rocksdb_transactiondb_create_column_family(txn_db, options, + "txn_db_cf", &err); + CheckNoError(err); + + rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello", + 8, &err); + CheckNoError(err); + CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello"); + + rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err); + CheckNoError(err); + CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL); + + rocksdb_column_family_handle_destroy(cfh); + + // close and destroy + rocksdb_transaction_destroy(txn); + rocksdb_transactiondb_close(txn_db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + rocksdb_transaction_options_destroy(txn_options); + rocksdb_transactiondb_options_destroy(txn_db_options); + } + + StartPhase("optimistic_transactions"); + { + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1); + otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err); + otxn_options = rocksdb_optimistictransaction_options_create(); + rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin( + otxn_db, woptions, otxn_options, NULL); + rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin( + otxn_db, woptions, otxn_options, NULL); + rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err); + CheckNoError(err); + rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err); + CheckNoError(err); + CheckTxnGet(txn1, roptions, "key", "value"); + rocksdb_transaction_commit(txn1, &err); + CheckNoError(err); + rocksdb_transaction_commit(txn2, &err); + CheckNoError(err); + rocksdb_transaction_destroy(txn1); + rocksdb_transaction_destroy(txn2); + + // Check column family + db = rocksdb_optimistictransactiondb_get_base_db(otxn_db); + rocksdb_put(db, woptions, "key", 3, "value", 5, &err); + CheckNoError(err); + rocksdb_column_family_handle_t *cfh1, *cfh2; + cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err); + cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err); + txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options, + NULL); + rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err); + CheckNoError(err); + rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err); + CheckNoError(err); + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options, + txn); + CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1"); + CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1"); + + // Check iterator with column family + rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err); + CheckNoError(err); + rocksdb_iterator_t* 
iter = + rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "key1_cf", "val1_cf"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + rocksdb_transaction_destroy(txn); + rocksdb_column_family_handle_destroy(cfh1); + rocksdb_column_family_handle_destroy(cfh2); + rocksdb_optimistictransactiondb_close_base_db(db); + rocksdb_optimistictransactiondb_close(otxn_db); + + // Check open optimistic transaction db with column families + size_t cf_len; + char** column_fams = + rocksdb_list_column_families(db_options, dbname, &cf_len, &err); + CheckNoError(err); + CheckEqual("default", column_fams[0], 7); + CheckEqual("txn_db_cf1", column_fams[1], 10); + CheckEqual("txn_db_cf2", column_fams[2], 10); + CheckCondition(cf_len == 3); + rocksdb_list_column_families_destroy(column_fams, cf_len); + + const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"}; + rocksdb_options_t* cf_options = rocksdb_options_create(); + const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options}; + + rocksdb_options_set_error_if_exists(cf_options, 0); + rocksdb_column_family_handle_t* cf_handles[3]; + otxn_db = rocksdb_optimistictransactiondb_open_column_families( + db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err); + CheckNoError(err); + rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin( + otxn_db, woptions, otxn_options, NULL); + CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value"); + CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1"); + CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2"); + rocksdb_transaction_destroy(txn_cf); + rocksdb_options_destroy(cf_options); + rocksdb_column_family_handle_destroy(cf_handles[0]); + rocksdb_column_family_handle_destroy(cf_handles[1]); + rocksdb_column_family_handle_destroy(cf_handles[2]); + rocksdb_optimistictransactiondb_close(otxn_db); + rocksdb_destroy_db(db_options, dbname, &err); + rocksdb_options_destroy(db_options); + rocksdb_optimistictransaction_options_destroy(otxn_options); + CheckNoError(err); + } + + // Simple sanity check that setting memtable rep works. + StartPhase("memtable_reps"); + { + // Create database with vector memtable. + rocksdb_options_set_memtable_vector_rep(options); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + // Create database with hash skiplist memtable. + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + } + + // Check that secondary instance works. 
+ StartPhase("open_as_secondary"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + db = rocksdb_open(db_options, dbname, &err); + CheckNoError(err); + rocksdb_t* db1; + rocksdb_options_t* opts = rocksdb_options_create(); + rocksdb_options_set_max_open_files(opts, -1); + rocksdb_options_set_create_if_missing(opts, 1); + snprintf(secondary_path, sizeof(secondary_path), + "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid())); + db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err); + CheckNoError(err); + + rocksdb_writeoptions_set_sync(woptions, 0); + rocksdb_writeoptions_disable_WAL(woptions, 1); + rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err); + CheckNoError(err); + rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_opts, 1); + rocksdb_flush(db, flush_opts, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(ropts, 1); + rocksdb_readoptions_set_snapshot(ropts, NULL); + CheckGet(db, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key0", "value0"); + + rocksdb_writeoptions_disable_WAL(woptions, 0); + rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + CheckGet(db1, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key1", "value1"); + + rocksdb_close(db1); + rocksdb_destroy_db(opts, secondary_path, &err); + CheckNoError(err); + + rocksdb_options_destroy(db_options); + rocksdb_options_destroy(opts); + rocksdb_readoptions_destroy(ropts); + rocksdb_flushoptions_destroy(flush_opts); + } + + // Simple sanity check that options setting db_paths work. + StartPhase("open_db_paths"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + const rocksdb_dbpath_t* paths[1] = {dbpath}; + rocksdb_options_set_db_paths(options, paths, 1); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + } + + StartPhase("cleanup"); + rocksdb_close(db); + rocksdb_options_destroy(options); + rocksdb_block_based_options_destroy(table_options); + rocksdb_readoptions_destroy(roptions); + rocksdb_writeoptions_destroy(woptions); + rocksdb_compactoptions_destroy(coptions); + rocksdb_cache_destroy(cache); + rocksdb_comparator_destroy(cmp); + rocksdb_dbpath_destroy(dbpath); + rocksdb_env_destroy(env); + + fprintf(stderr, "PASS\n"); + return 0; +} + +#else + +int main() { + fprintf(stderr, "SKIPPED\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc new file mode 100644 index 000000000..928a02a1f --- /dev/null +++ b/src/rocksdb/db/column_family.cc @@ -0,0 +1,1523 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/column_family.h" + +#include <algorithm> +#include <cinttypes> +#include <limits> +#include <string> +#include <vector> + +#include "db/compaction/compaction_picker.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" +#include "db/db_impl/db_impl.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/range_del_aggregator.h" +#include "db/table_properties_collector.h" +#include "db/version_set.h" +#include "db/write_controller.h" +#include "file/sst_file_manager_impl.h" +#include "memtable/hash_skiplist_rep.h" +#include "monitoring/thread_status_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/merging_iterator.h" +#include "util/autovector.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +ColumnFamilyHandleImpl::ColumnFamilyHandleImpl( + ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex) + : cfd_(column_family_data), db_(db), mutex_(mutex) { + if (cfd_ != nullptr) { + cfd_->Ref(); + } +} + +ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { + if (cfd_ != nullptr) { +#ifndef ROCKSDB_LITE + for (auto& listener : cfd_->ioptions()->listeners) { + listener->OnColumnFamilyHandleDeletionStarted(this); + } +#endif // ROCKSDB_LITE + // Job id == 0 means that this is not our background process, but rather + // user thread + // Need to hold some shared pointers owned by the initial_cf_options + // before final cleaning up finishes. + ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options(); + JobContext job_context(0); + mutex_->Lock(); + bool dropped = cfd_->IsDropped(); + if (cfd_->UnrefAndTryDelete()) { + if (dropped) { + db_->FindObsoleteFiles(&job_context, false, true); + } + } + mutex_->Unlock(); + if (job_context.HaveSomethingToDelete()) { + bool defer_purge = + db_->immutable_db_options().avoid_unnecessary_blocking_io; + db_->PurgeObsoleteFiles(job_context, defer_purge); + if (defer_purge) { + mutex_->Lock(); + db_->SchedulePurge(); + mutex_->Unlock(); + } + } + job_context.Clean(); + } +} + +uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } + +const std::string& ColumnFamilyHandleImpl::GetName() const { + return cfd()->GetName(); +} + +Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) { +#ifndef ROCKSDB_LITE + // accessing mutable cf-options requires db mutex.
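+ // The lock-then-copy step below is the whole trick: mutable CF options can
+ // be rewritten by SetOptions() at any time, so a consistent
+ // ColumnFamilyDescriptor can only be assembled while the DB mutex is held.
+ // From the caller's side that means simply (a sketch, not code from this
+ // file):
+ #if 0
+ ColumnFamilyDescriptor desc;
+ Status s = handle->GetDescriptor(&desc);  // takes the DB mutex internally
+ if (s.ok()) {
+   // desc is a self-consistent snapshot of name + options
+ }
+ #endif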
InstrumentedMutexLock l(mutex_); + *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions()); + return Status::OK(); +#else + (void)desc; + return Status::NotSupported(); +#endif // !ROCKSDB_LITE +} + +const Comparator* ColumnFamilyHandleImpl::GetComparator() const { + return cfd()->user_comparator(); +} + +void GetIntTblPropCollectorFactory( + const ImmutableCFOptions& ioptions, + std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories) { + auto& collector_factories = ioptions.table_properties_collector_factories; + for (size_t i = 0; i < ioptions.table_properties_collector_factories.size(); + ++i) { + assert(collector_factories[i]); + int_tbl_prop_collector_factories->emplace_back( + new UserKeyTablePropertiesCollectorFactory(collector_factories[i])); + } +} + +Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) { + if (!cf_options.compression_per_level.empty()) { + for (size_t level = 0; level < cf_options.compression_per_level.size(); + ++level) { + if (!CompressionTypeSupported(cf_options.compression_per_level[level])) { + return Status::InvalidArgument( + "Compression type " + + CompressionTypeToString(cf_options.compression_per_level[level]) + + " is not linked with the binary."); + } + } + } else { + if (!CompressionTypeSupported(cf_options.compression)) { + return Status::InvalidArgument( + "Compression type " + + CompressionTypeToString(cf_options.compression) + + " is not linked with the binary."); + } + } + if (cf_options.compression_opts.zstd_max_train_bytes > 0) { + if (!ZSTD_TrainDictionarySupported()) { + return Status::InvalidArgument( + "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ " + "is not linked with the binary."); + } + if (cf_options.compression_opts.max_dict_bytes == 0) { + return Status::InvalidArgument( + "The dictionary size limit (`CompressionOptions::max_dict_bytes`) " + "should be nonzero if we're using zstd's dictionary generator."); + } + } + return Status::OK(); +} + +Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) { + if (cf_options.inplace_update_support) { + return Status::InvalidArgument( + "In-place memtable updates (inplace_update_support) is not compatible " + "with concurrent writes (allow_concurrent_memtable_write)"); + } + if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) { + return Status::InvalidArgument( + "Memtable doesn't support concurrent writes (allow_concurrent_memtable_write)"); + } + return Status::OK(); +} + +Status CheckCFPathsSupported(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) { + // More than one cf_paths entry is supported only in universal + // and level compaction styles. This function also checks the case + // in which cf_paths is not specified, which results in db_paths + // being used. + if ((cf_options.compaction_style != kCompactionStyleUniversal) && + (cf_options.compaction_style != kCompactionStyleLevel)) { + if (cf_options.cf_paths.size() > 1) { + return Status::NotSupported( + "More than one CF path is only supported in " + "universal and level compaction styles. "); + } else if (cf_options.cf_paths.empty() && + db_options.db_paths.size() > 1) { + return Status::NotSupported( + "More than one DB path is only supported in " + "universal and level compaction styles. 
"); + } + } + return Status::OK(); +} + +namespace { +const uint64_t kDefaultTtl = 0xfffffffffffffffe; +const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; +}; // namespace + +ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& src) { + ColumnFamilyOptions result = src; + size_t clamp_max = std::conditional< + sizeof(size_t) == 4, std::integral_constant, + std::integral_constant>::type::value; + ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, clamp_max); + // if user sets arena_block_size, we trust user to use this value. Otherwise, + // calculate a proper value from writer_buffer_size; + if (result.arena_block_size <= 0) { + result.arena_block_size = result.write_buffer_size / 8; + + // Align up to 4k + const size_t align = 4 * 1024; + result.arena_block_size = + ((result.arena_block_size + align - 1) / align) * align; + } + result.min_write_buffer_number_to_merge = + std::min(result.min_write_buffer_number_to_merge, + result.max_write_buffer_number - 1); + if (result.min_write_buffer_number_to_merge < 1) { + result.min_write_buffer_number_to_merge = 1; + } + + if (result.num_levels < 1) { + result.num_levels = 1; + } + if (result.compaction_style == kCompactionStyleLevel && + result.num_levels < 2) { + result.num_levels = 2; + } + + if (result.compaction_style == kCompactionStyleUniversal && + db_options.allow_ingest_behind && result.num_levels < 3) { + result.num_levels = 3; + } + + if (result.max_write_buffer_number < 2) { + result.max_write_buffer_number = 2; + } + // fall back max_write_buffer_number_to_maintain if + // max_write_buffer_size_to_maintain is not set + if (result.max_write_buffer_size_to_maintain < 0) { + result.max_write_buffer_size_to_maintain = + result.max_write_buffer_number * + static_cast(result.write_buffer_size); + } else if (result.max_write_buffer_size_to_maintain == 0 && + result.max_write_buffer_number_to_maintain < 0) { + result.max_write_buffer_number_to_maintain = result.max_write_buffer_number; + } + // bloom filter size shouldn't exceed 1/4 of memtable size. 
+ if (result.memtable_prefix_bloom_size_ratio > 0.25) { + result.memtable_prefix_bloom_size_ratio = 0.25; + } else if (result.memtable_prefix_bloom_size_ratio < 0) { + result.memtable_prefix_bloom_size_ratio = 0; + } + + if (!result.prefix_extractor) { + assert(result.memtable_factory); + Slice name = result.memtable_factory->Name(); + if (name.compare("HashSkipListRepFactory") == 0 || + name.compare("HashLinkListRepFactory") == 0) { + result.memtable_factory = std::make_shared<SkipListFactory>(); + } + } + + if (result.compaction_style == kCompactionStyleFIFO) { + result.num_levels = 1; + // since we delete level0 files in FIFO compaction when there are too many + // of them, these options don't really mean anything + result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max(); + result.level0_stop_writes_trigger = std::numeric_limits<int>::max(); + } + + if (result.max_bytes_for_level_multiplier <= 0) { + result.max_bytes_for_level_multiplier = 1; + } + + if (result.level0_file_num_compaction_trigger == 0) { + ROCKS_LOG_WARN(db_options.info_log.get(), + "level0_file_num_compaction_trigger cannot be 0"); + result.level0_file_num_compaction_trigger = 1; + } + + if (result.level0_stop_writes_trigger < + result.level0_slowdown_writes_trigger || + result.level0_slowdown_writes_trigger < + result.level0_file_num_compaction_trigger) { + ROCKS_LOG_WARN(db_options.info_log.get(), + "This condition must be satisfied: " + "level0_stop_writes_trigger(%d) >= " + "level0_slowdown_writes_trigger(%d) >= " + "level0_file_num_compaction_trigger(%d)", + result.level0_stop_writes_trigger, + result.level0_slowdown_writes_trigger, + result.level0_file_num_compaction_trigger); + if (result.level0_slowdown_writes_trigger < + result.level0_file_num_compaction_trigger) { + result.level0_slowdown_writes_trigger = + result.level0_file_num_compaction_trigger; + } + if (result.level0_stop_writes_trigger < + result.level0_slowdown_writes_trigger) { + result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger; + } + ROCKS_LOG_WARN(db_options.info_log.get(), + "Adjust the value to " + "level0_stop_writes_trigger(%d)" + "level0_slowdown_writes_trigger(%d)" + "level0_file_num_compaction_trigger(%d)", + result.level0_stop_writes_trigger, + result.level0_slowdown_writes_trigger, + result.level0_file_num_compaction_trigger); + } + + if (result.soft_pending_compaction_bytes_limit == 0) { + result.soft_pending_compaction_bytes_limit = + result.hard_pending_compaction_bytes_limit; + } else if (result.hard_pending_compaction_bytes_limit > 0 && + result.soft_pending_compaction_bytes_limit > + result.hard_pending_compaction_bytes_limit) { + result.soft_pending_compaction_bytes_limit = + result.hard_pending_compaction_bytes_limit; + } + +#ifndef ROCKSDB_LITE + // When the DB is stopped, it's possible that there are some .trash files that + // were not deleted yet, when we open the DB we will find these .trash files + // and schedule them to be deleted (or delete immediately if SstFileManager + // was not used) + auto sfm = static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get()); + for (size_t i = 0; i < result.cf_paths.size(); i++) { + DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path); + } +#endif + + if (result.cf_paths.empty()) { + result.cf_paths = db_options.db_paths; + } + + if (result.level_compaction_dynamic_level_bytes) { + if (result.compaction_style != kCompactionStyleLevel || + result.cf_paths.size() > 1U) { + // 1.
level_compaction_dynamic_level_bytes only makes sense for + // level-based compaction. + // 2. we don't yet know how to make this feature and multiple + // DB paths work together. + result.level_compaction_dynamic_level_bytes = false; + } + } + + if (result.max_compaction_bytes == 0) { + result.max_compaction_bytes = result.target_file_size_base * 25; + } + + bool is_block_based_table = + (result.table_factory->Name() == BlockBasedTableFactory().Name()); + + const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; + if (result.ttl == kDefaultTtl) { + if (is_block_based_table && + result.compaction_style != kCompactionStyleFIFO) { + result.ttl = kAdjustedTtl; + } else { + result.ttl = 0; + } + } + + const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60; + + // Turn on periodic compactions and set them to occur once every 30 days if + // compaction filters are used and periodic_compaction_seconds is set to the + // default value. + if (result.compaction_style != kCompactionStyleFIFO) { + if ((result.compaction_filter != nullptr || + result.compaction_filter_factory != nullptr) && + result.periodic_compaction_seconds == kDefaultPeriodicCompSecs && + is_block_based_table) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + } else { + // result.compaction_style == kCompactionStyleFIFO + if (result.ttl == 0) { + if (is_block_based_table) { + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + result.ttl = result.periodic_compaction_seconds; + } + } else if (result.periodic_compaction_seconds != 0) { + result.ttl = std::min(result.ttl, result.periodic_compaction_seconds); + } + } + + // TTL compactions would work similar to Periodic Compactions in Universal in + // most of the cases. So, if ttl is set, execute the periodic compaction + // codepath.
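+ // A concrete reading of the rule implemented below (values illustrative):
+ // under universal compaction,
+ //   ttl = 10 days, periodic_compaction_seconds = 30 days
+ //     -> periodic_compaction_seconds = min(10, 30) = 10 days;
+ //   ttl = 10 days, periodic_compaction_seconds = 0 (unset)
+ //     -> periodic_compaction_seconds = 10 days.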
+ if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) { + if (result.periodic_compaction_seconds != 0) { + result.periodic_compaction_seconds = + std::min(result.ttl, result.periodic_compaction_seconds); + } else { + result.periodic_compaction_seconds = result.ttl; + } + } + + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = 0; + } + + return result; +} + +int SuperVersion::dummy = 0; +void* const SuperVersion::kSVInUse = &SuperVersion::dummy; +void* const SuperVersion::kSVObsolete = nullptr; + +SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +SuperVersion* SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool SuperVersion::Unref() { + // fetch_sub returns the previous value of ref + uint32_t previous_refs = refs.fetch_sub(1); + assert(previous_refs > 0); + return previous_refs == 1; +} + +void SuperVersion::Cleanup() { + assert(refs.load(std::memory_order_relaxed) == 0); + imm->Unref(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + auto* memory_usage = current->cfd()->imm()->current_memory_usage(); + assert(*memory_usage >= m->ApproximateMemoryUsage()); + *memory_usage -= m->ApproximateMemoryUsage(); + to_delete.push_back(m); + } + current->Unref(); + if (cfd->Unref()) { + delete cfd; + } +} + +void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, + MemTableListVersion* new_imm, Version* new_current) { + cfd = new_cfd; + mem = new_mem; + imm = new_imm; + current = new_current; + cfd->Ref(); + mem->Ref(); + imm->Ref(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + +namespace { +void SuperVersionUnrefHandle(void* ptr) { + // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets + // destroyed. When the former happens, the thread shouldn't see kSVInUse. + // When the latter happens, we are in ~ColumnFamilyData(), so no get should + // happen either. + SuperVersion* sv = static_cast<SuperVersion*>(ptr); + bool was_last_ref __attribute__((__unused__)); + was_last_ref = sv->Unref(); + // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_. + // This is important because we can't do SuperVersion cleanup here. + // That would require locking DB mutex, which would deadlock because + // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
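+ // The reference counting relied on here is plain fetch_add/fetch_sub; a
+ // minimal model of the protocol (names ours, for illustration only):
+ #if 0
+ std::atomic<uint32_t> refs{1};         // super_version_ itself owns one ref
+ refs.fetch_add(1);                     // a thread-local cache takes a ref
+ bool last = (refs.fetch_sub(1) == 1);  // this handler's Unref()
+ assert(!last);  // super_version_'s ref is dropped only after a Scrape
+ #endif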
+ assert(!was_last_ref); +} +} // anonymous namespace + +ColumnFamilyData::ColumnFamilyData( + uint32_t id, const std::string& name, Version* _dummy_versions, + Cache* _table_cache, WriteBufferManager* write_buffer_manager, + const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, + const FileOptions& file_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer) + : id_(id), + name_(name), + dummy_versions_(_dummy_versions), + current_(nullptr), + refs_(0), + initialized_(false), + dropped_(false), + internal_comparator_(cf_options.comparator), + initial_cf_options_(SanitizeOptions(db_options, cf_options)), + ioptions_(db_options, initial_cf_options_), + mutable_cf_options_(initial_cf_options_), + is_delete_range_supported_( + cf_options.table_factory->IsDeleteRangeSupported()), + write_buffer_manager_(write_buffer_manager), + mem_(nullptr), + imm_(ioptions_.min_write_buffer_number_to_merge, + ioptions_.max_write_buffer_number_to_maintain, + ioptions_.max_write_buffer_size_to_maintain), + super_version_(nullptr), + super_version_number_(0), + local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), + next_(nullptr), + prev_(nullptr), + log_number_(0), + flush_reason_(FlushReason::kOthers), + column_family_set_(column_family_set), + queued_for_flush_(false), + queued_for_compaction_(false), + prev_compaction_needed_bytes_(0), + allow_2pc_(db_options.allow_2pc), + last_memtable_id_(0) { + Ref(); + + // Convert user defined table properties collector factories to internal ones. + GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_); + + // if _dummy_versions is nullptr, then this is a dummy column family. + if (_dummy_versions != nullptr) { + internal_stats_.reset( + new InternalStats(ioptions_.num_levels, db_options.env, this)); + table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache, + block_cache_tracer)); + if (ioptions_.compaction_style == kCompactionStyleLevel) { + compaction_picker_.reset( + new LevelCompactionPicker(ioptions_, &internal_comparator_)); +#ifndef ROCKSDB_LITE + } else if (ioptions_.compaction_style == kCompactionStyleUniversal) { + compaction_picker_.reset( + new UniversalCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleFIFO) { + compaction_picker_.reset( + new FIFOCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleNone) { + compaction_picker_.reset(new NullCompactionPicker( + ioptions_, &internal_comparator_)); + ROCKS_LOG_WARN(ioptions_.info_log, + "Column family %s does not use any background compaction. " + "Compactions can only be done via CompactFiles\n", + GetName().c_str()); +#endif // !ROCKSDB_LITE + } else { + ROCKS_LOG_ERROR(ioptions_.info_log, + "Unable to recognize the specified compaction style %d. 
" + "Column family %s will use kCompactionStyleLevel.\n", + ioptions_.compaction_style, GetName().c_str()); + compaction_picker_.reset( + new LevelCompactionPicker(ioptions_, &internal_comparator_)); + } + + if (column_family_set_->NumberOfColumnFamilies() < 10) { + ROCKS_LOG_INFO(ioptions_.info_log, + "--------------- Options for column family [%s]:\n", + name.c_str()); + initial_cf_options_.Dump(ioptions_.info_log); + } else { + ROCKS_LOG_INFO(ioptions_.info_log, "\t(skipping printing options)\n"); + } + } + + RecalculateWriteStallConditions(mutable_cf_options_); +} + +// DB mutex held +ColumnFamilyData::~ColumnFamilyData() { + assert(refs_.load(std::memory_order_relaxed) == 0); + // remove from linked list + auto prev = prev_; + auto next = next_; + prev->next_ = next; + next->prev_ = prev; + + if (!dropped_ && column_family_set_ != nullptr) { + // If it's dropped, it's already removed from column family set + // If column_family_set_ == nullptr, this is dummy CFD and not in + // ColumnFamilySet + column_family_set_->RemoveColumnFamily(this); + } + + if (current_ != nullptr) { + current_->Unref(); + } + + // It would be wrong if this ColumnFamilyData is in flush_queue_ or + // compaction_queue_ and we destroyed it + assert(!queued_for_flush_); + assert(!queued_for_compaction_); + assert(super_version_ == nullptr); + + if (dummy_versions_ != nullptr) { + // List must be empty + assert(dummy_versions_->TEST_Next() == dummy_versions_); + bool deleted __attribute__((__unused__)); + deleted = dummy_versions_->Unref(); + assert(deleted); + } + + if (mem_ != nullptr) { + delete mem_->Unref(); + } + autovector to_delete; + imm_.current()->Unref(&to_delete); + for (MemTable* m : to_delete) { + delete m; + } +} + +bool ColumnFamilyData::UnrefAndTryDelete() { + int old_refs = refs_.fetch_sub(1); + assert(old_refs > 0); + + if (old_refs == 1) { + assert(super_version_ == nullptr); + delete this; + return true; + } + + if (old_refs == 2 && super_version_ != nullptr) { + // Only the super_version_ holds me + SuperVersion* sv = super_version_; + super_version_ = nullptr; + // Release SuperVersion reference kept in ThreadLocalPtr. + // This must be done outside of mutex_ since unref handler can lock mutex. 
+ sv->db_mutex->Unlock(); + local_sv_.reset(); + sv->db_mutex->Lock(); + + if (sv->Unref()) { + // May delete this ColumnFamilyData after calling Cleanup() + sv->Cleanup(); + delete sv; + return true; + } + } + return false; +} + +void ColumnFamilyData::SetDropped() { + // can't drop default CF + assert(id_ != 0); + dropped_ = true; + write_controller_token_.reset(); + + // remove from column_family_set + column_family_set_->RemoveColumnFamily(this); +} + +ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const { + return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); +} + +uint64_t ColumnFamilyData::OldestLogToKeep() { + auto current_log = GetLogNumber(); + + if (allow_2pc_) { + autovector<MemTable*> empty_list; + auto imm_prep_log = + imm()->PrecomputeMinLogContainingPrepSection(empty_list); + auto mem_prep_log = mem()->GetMinLogContainingPrepSection(); + + if (imm_prep_log > 0 && imm_prep_log < current_log) { + current_log = imm_prep_log; + } + + if (mem_prep_log > 0 && mem_prep_log < current_log) { + current_log = mem_prep_log; + } + } + + return current_log; +} + +const double kIncSlowdownRatio = 0.8; +const double kDecSlowdownRatio = 1 / kIncSlowdownRatio; +const double kNearStopSlowdownRatio = 0.6; +const double kDelayRecoverSlowdownRatio = 1.4; + +namespace { +// If penalize_stop is true, we further reduce slowdown rate. +std::unique_ptr<WriteControllerToken> SetupDelay( + WriteController* write_controller, uint64_t compaction_needed_bytes, + uint64_t prev_compaction_need_bytes, bool penalize_stop, + bool auto_compactions_disabled) { + const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s. + + uint64_t max_write_rate = write_controller->max_delayed_write_rate(); + uint64_t write_rate = write_controller->delayed_write_rate(); + + if (auto_compactions_disabled) { + // When auto compaction is disabled, always use the value user gave. + write_rate = max_write_rate; + } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) { + // If user gives rate less than kMinWriteRate, don't adjust it. + // + // If already delayed, need to adjust based on previous compaction debt. + // When two or more column families require delay, we always + // increase or reduce the write rate based on information for one single + // column family. It is likely to be OK but we can improve if there is a + // problem. + // Ignore the compaction_needed_bytes = 0 case because compaction_needed_bytes + // is only available in level-based compaction + // + // If the compaction debt stays the same as previously, we also further slow + // down. It usually means a mem table is full. It's mainly for the case + // where both flush and compaction are much slower than the speed we + // insert to mem tables, so we need to actively slow down before we get + // feedback signal from compaction and flushes to avoid the full stop + // because of hitting the max write buffer number. + // + // If the DB just fell into the stop condition, we need to further reduce + // the write rate to avoid the stop condition. + if (penalize_stop) { + // Penalize the near stop or stop condition by more aggressive slowdown. + // This is to provide the long term slowdown increase signal. + // The penalty is more than the reward of recovering to the normal + // condition.
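+ // Worked numbers for the adjustments below (illustrative only), starting
+ // from write_rate = 16 MB/s:
+ //   near-stop penalty:  16 MB/s * kNearStopSlowdownRatio (0.6) -> 9.6 MB/s
+ //   debt not shrinking: 16 MB/s * kIncSlowdownRatio (0.8)      -> 12.8 MB/s
+ //   debt paid down:     16 MB/s * kDecSlowdownRatio (1.25)     -> 20 MB/s,
+ //                       capped at max_delayed_write_rate.
+ // In all cases the rate never drops below kMinWriteRate (16 KB/s).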
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) * + kNearStopSlowdownRatio); + if (write_rate < kMinWriteRate) { + write_rate = kMinWriteRate; + } + } else if (prev_compaction_need_bytes > 0 && + prev_compaction_need_bytes <= compaction_needed_bytes) { + write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) * + kIncSlowdownRatio); + if (write_rate < kMinWriteRate) { + write_rate = kMinWriteRate; + } + } else if (prev_compaction_need_bytes > compaction_needed_bytes) { + // We are speeding up by a ratio of kDecSlowdownRatio when we have paid + // compaction debt. But we'll never speed up to faster than the write rate + // given by users. + write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) * + kDecSlowdownRatio); + if (write_rate > max_write_rate) { + write_rate = max_write_rate; + } + } + } + return write_controller->GetDelayToken(write_rate); +} + +int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, + int level0_slowdown_writes_trigger) { + // SanitizeOptions() ensures it. + assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger); + + if (level0_file_num_compaction_trigger < 0) { + return std::numeric_limits<int>::max(); + } + + const int64_t twice_level0_trigger = + static_cast<int64_t>(level0_file_num_compaction_trigger) * 2; + + const int64_t one_fourth_trigger_slowdown = + static_cast<int64_t>(level0_file_num_compaction_trigger) + + ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) / + 4); + + assert(twice_level0_trigger >= 0); + assert(one_fourth_trigger_slowdown >= 0); + + // 1/4 of the way between L0 compaction trigger threshold and slowdown + // condition. + // Or twice the compaction trigger, if that is smaller. + int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown); + if (res >= port::kMaxInt32) { + return port::kMaxInt32; + } else { + // res fits in int + return static_cast<int>(res); + } +} +} // namespace + +std::pair<WriteStallCondition, WriteStallCause> +ColumnFamilyData::GetWriteStallConditionAndCause( + int num_unflushed_memtables, int num_l0_files, + uint64_t num_compaction_needed_bytes, + const MutableCFOptions& mutable_cf_options) { + if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) { + return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) { + return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && + num_compaction_needed_bytes >= + mutable_cf_options.hard_pending_compaction_bytes_limit) { + return {WriteStallCondition::kStopped, + WriteStallCause::kPendingCompactionBytes}; + } else if (mutable_cf_options.max_write_buffer_number > 3 && + num_unflushed_memtables >= + mutable_cf_options.max_write_buffer_number - 1) { + return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.level0_slowdown_writes_trigger >= 0 && + num_l0_files >= + mutable_cf_options.level0_slowdown_writes_trigger) { + return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.soft_pending_compaction_bytes_limit > 0 && + num_compaction_needed_bytes >= + mutable_cf_options.soft_pending_compaction_bytes_limit) { + return {WriteStallCondition::kDelayed, +
WriteStallCause::kPendingCompactionBytes}; + } + return {WriteStallCondition::kNormal, WriteStallCause::kNone}; +} + +WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( + const MutableCFOptions& mutable_cf_options) { + auto write_stall_condition = WriteStallCondition::kNormal; + if (current_ != nullptr) { + auto* vstorage = current_->storage_info(); + auto write_controller = column_family_set_->write_controller_; + uint64_t compaction_needed_bytes = + vstorage->estimated_compaction_needed_bytes(); + + auto write_stall_condition_and_cause = GetWriteStallConditionAndCause( + imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(), + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options); + write_stall_condition = write_stall_condition_and_cause.first; + auto write_stall_cause = write_stall_condition_and_cause.second; + + bool was_stopped = write_controller->IsStopped(); + bool needed_delay = write_controller->NeedsDelay(); + + if (write_stall_condition == WriteStallCondition::kStopped && + write_stall_cause == WriteStallCause::kMemtableLimit) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); + ROCKS_LOG_WARN( + ioptions_.info_log, + "[%s] Stopping writes because we have %d immutable memtables " + "(waiting for flush), max_write_buffer_number is set to %d", + name_.c_str(), imm()->NumNotFlushed(), + mutable_cf_options.max_write_buffer_number); + } else if (write_stall_condition == WriteStallCondition::kStopped && + write_stall_cause == WriteStallCause::kL0FileCountLimit) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1); + if (compaction_picker_->IsLevel0CompactionInProgress()) { + internal_stats_->AddCFStats( + InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); + } + ROCKS_LOG_WARN(ioptions_.info_log, + "[%s] Stopping writes because we have %d level-0 files", + name_.c_str(), vstorage->l0_delay_trigger_count()); + } else if (write_stall_condition == WriteStallCondition::kStopped && + write_stall_cause == WriteStallCause::kPendingCompactionBytes) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats( + InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); + ROCKS_LOG_WARN( + ioptions_.info_log, + "[%s] Stopping writes because of estimated pending compaction " + "bytes %" PRIu64, + name_.c_str(), compaction_needed_bytes); + } else if (write_stall_condition == WriteStallCondition::kDelayed && + write_stall_cause == WriteStallCause::kMemtableLimit) { + write_controller_token_ = + SetupDelay(write_controller, compaction_needed_bytes, + prev_compaction_needed_bytes_, was_stopped, + mutable_cf_options.disable_auto_compactions); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); + ROCKS_LOG_WARN( + ioptions_.info_log, + "[%s] Stalling writes because we have %d immutable memtables " + "(waiting for flush), max_write_buffer_number is set to %d " + "rate %" PRIu64, + name_.c_str(), imm()->NumNotFlushed(), + mutable_cf_options.max_write_buffer_number, + write_controller->delayed_write_rate()); + } else if (write_stall_condition == WriteStallCondition::kDelayed && + write_stall_cause == WriteStallCause::kL0FileCountLimit) { + // L0 is within two files of the stop trigger.
+ bool near_stop = vstorage->l0_delay_trigger_count() >= + mutable_cf_options.level0_stop_writes_trigger - 2; + write_controller_token_ = + SetupDelay(write_controller, compaction_needed_bytes, + prev_compaction_needed_bytes_, was_stopped || near_stop, + mutable_cf_options.disable_auto_compactions); + internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS, + 1); + if (compaction_picker_->IsLevel0CompactionInProgress()) { + internal_stats_->AddCFStats( + InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); + } + ROCKS_LOG_WARN(ioptions_.info_log, + "[%s] Stalling writes because we have %d level-0 files " + "rate %" PRIu64, + name_.c_str(), vstorage->l0_delay_trigger_count(), + write_controller->delayed_write_rate()); + } else if (write_stall_condition == WriteStallCondition::kDelayed && + write_stall_cause == WriteStallCause::kPendingCompactionBytes) { + // If the distance to the hard limit is less than 1/4 of the gap between + // the soft and hard bytes limits, we consider it near-stop and speed up + // the slowdown. + bool near_stop = + mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && + (compaction_needed_bytes - + mutable_cf_options.soft_pending_compaction_bytes_limit) > + 3 * (mutable_cf_options.hard_pending_compaction_bytes_limit - + mutable_cf_options.soft_pending_compaction_bytes_limit) / + 4; + + write_controller_token_ = + SetupDelay(write_controller, compaction_needed_bytes, + prev_compaction_needed_bytes_, was_stopped || near_stop, + mutable_cf_options.disable_auto_compactions); + internal_stats_->AddCFStats( + InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); + ROCKS_LOG_WARN( + ioptions_.info_log, + "[%s] Stalling writes because of estimated pending compaction " + "bytes %" PRIu64 " rate %" PRIu64, + name_.c_str(), vstorage->estimated_compaction_needed_bytes(), + write_controller->delayed_write_rate()); + } else { + assert(write_stall_condition == WriteStallCondition::kNormal); + if (vstorage->l0_delay_trigger_count() >= + GetL0ThresholdSpeedupCompaction( + mutable_cf_options.level0_file_num_compaction_trigger, + mutable_cf_options.level0_slowdown_writes_trigger)) { + write_controller_token_ = + write_controller->GetCompactionPressureToken(); + ROCKS_LOG_INFO( + ioptions_.info_log, + "[%s] Increasing compaction threads because we have %d level-0 " + "files ", + name_.c_str(), vstorage->l0_delay_trigger_count()); + } else if (vstorage->estimated_compaction_needed_bytes() >= + mutable_cf_options.soft_pending_compaction_bytes_limit / 4) { + // Increase compaction threads if bytes needed for compaction exceeds + // 1/4 of threshold for slowing down. + // If soft pending compaction byte limit is not set, always speed up + // compaction. + write_controller_token_ = + write_controller->GetCompactionPressureToken(); + if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { + ROCKS_LOG_INFO( + ioptions_.info_log, + "[%s] Increasing compaction threads because of estimated pending " + "compaction " + "bytes %" PRIu64, + name_.c_str(), vstorage->estimated_compaction_needed_bytes()); + } + } else { + write_controller_token_.reset(); + } + // If the DB recovers from delay conditions, we reward it by raising the + // delayed write rate by double the slowdown ratio. This is to balance the + // long term slowdown increase signal.
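+ // Illustrative numbers for the recovery step below: if writes were being
+ // delayed at 10 MB/s when the stall clears, the delayed rate is raised to
+ // 10 MB/s * kDelayRecoverSlowdownRatio (1.4) = 14 MB/s for the next stall,
+ // and the low-pri rate limiter is pinned at 10 MB/s / 4 = 2.5 MB/s.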
+ if (needed_delay) { + uint64_t write_rate = write_controller->delayed_write_rate(); + write_controller->set_delayed_write_rate(static_cast<uint64_t>( + static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio)); + // Set the low pri limit to be 1/4 the delayed write rate. + // Note we don't reset this value even after delay condition is released. + // Low-pri rate will continue to apply if there is a compaction + // pressure. + write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate / + 4); + } + } + prev_compaction_needed_bytes_ = compaction_needed_bytes; + } + return write_stall_condition; +} + +const FileOptions* ColumnFamilyData::soptions() const { + return &(column_family_set_->file_options_); +} + +void ColumnFamilyData::SetCurrent(Version* current_version) { + current_ = current_version; +} + +uint64_t ColumnFamilyData::GetNumLiveVersions() const { + return VersionSet::GetNumLiveVersions(dummy_versions_); +} + +uint64_t ColumnFamilyData::GetTotalSstFilesSize() const { + return VersionSet::GetTotalSstFilesSize(dummy_versions_); +} + +uint64_t ColumnFamilyData::GetLiveSstFilesSize() const { + return current_->GetSstFilesSize(); +} + +MemTable* ColumnFamilyData::ConstructNewMemtable( + const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { + return new MemTable(internal_comparator_, ioptions_, mutable_cf_options, + write_buffer_manager_, earliest_seq, id_); +} + +void ColumnFamilyData::CreateNewMemtable( + const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { + if (mem_ != nullptr) { + delete mem_->Unref(); + } + SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq)); + mem_->Ref(); +} + +bool ColumnFamilyData::NeedsCompaction() const { + return compaction_picker_->NeedsCompaction(current_->storage_info()); +} + +Compaction* ColumnFamilyData::PickCompaction( + const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { + SequenceNumber earliest_mem_seqno = + std::min(mem_->GetEarliestSequenceNumber(), + imm_.current()->GetEarliestSequenceNumber(false)); + auto* result = compaction_picker_->PickCompaction( + GetName(), mutable_options, current_->storage_info(), log_buffer, + earliest_mem_seqno); + if (result != nullptr) { + result->SetInputVersion(current_); + } + return result; +} + +bool ColumnFamilyData::RangeOverlapWithCompaction( + const Slice& smallest_user_key, const Slice& largest_user_key, + int level) const { + return compaction_picker_->RangeOverlapWithCompaction( + smallest_user_key, largest_user_key, level); +} + +Status ColumnFamilyData::RangesOverlapWithMemtables( + const autovector<Range>& ranges, SuperVersion* super_version, + bool* overlap) { + assert(overlap != nullptr); + *overlap = false; + // Create an InternalIterator over all unflushed memtables + Arena arena; + ReadOptions read_opts; + read_opts.total_order_seek = true; + MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(read_opts, &arena)); + super_version->imm->AddIterators(read_opts, &merge_iter_builder); + ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); + + auto read_seq = super_version->current->version_set()->LastSequence(); + ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq); + auto* active_range_del_iter = + super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq); + range_del_agg.AddTombstones( + std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter)); + super_version->imm->AddRangeTombstoneIterators(read_opts, nullptr /* arena */,
+ &range_del_agg); + + Status status; + for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) { + auto* vstorage = super_version->current->storage_info(); + auto* ucmp = vstorage->InternalComparator()->user_comparator(); + InternalKey range_start(ranges[i].start, kMaxSequenceNumber, + kValueTypeForSeek); + memtable_iter->Seek(range_start.Encode()); + status = memtable_iter->status(); + ParsedInternalKey seek_result; + if (status.ok()) { + if (memtable_iter->Valid() && + !ParseInternalKey(memtable_iter->key(), &seek_result)) { + status = Status::Corruption("DB has corrupted keys"); + } + } + if (status.ok()) { + if (memtable_iter->Valid() && + ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { + *overlap = true; + } else if (range_del_agg.IsRangeOverlapped(ranges[i].start, + ranges[i].limit)) { + *overlap = true; + } + } + } + return status; +} + +const int ColumnFamilyData::kCompactAllLevels = -1; +const int ColumnFamilyData::kCompactToBaseLevel = -2; + +Compaction* ColumnFamilyData::CompactRange( + const MutableCFOptions& mutable_cf_options, int input_level, + int output_level, const CompactRangeOptions& compact_range_options, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* conflict, + uint64_t max_file_num_to_ignore) { + auto* result = compaction_picker_->CompactRange( + GetName(), mutable_cf_options, current_->storage_info(), input_level, + output_level, compact_range_options, begin, end, compaction_end, conflict, + max_file_num_to_ignore); + if (result != nullptr) { + result->SetInputVersion(current_); + } + return result; +} + +SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) { + SuperVersion* sv = GetThreadLocalSuperVersion(db); + sv->Ref(); + if (!ReturnThreadLocalSuperVersion(sv)) { + // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion() + // when the thread-local pointer was populated. So, the Ref() earlier in + // this function still prevents the returned SuperVersion* from being + // deleted out from under the caller. + sv->Unref(); + } + return sv; +} + +SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) { + // The SuperVersion is cached in thread local storage to avoid acquiring + // mutex when SuperVersion does not change since the last use. When a new + // SuperVersion is installed, the compaction or flush thread cleans up + // cached SuperVersion in all existing thread local storage. To avoid + // acquiring mutex for this operation, we use atomic Swap() on the thread + // local pointer to guarantee exclusive access. If the thread local pointer + // is being used while a new SuperVersion is installed, the cached + // SuperVersion can become stale. In that case, the background thread would + // have swapped in kSVObsolete. We re-check the value when returning the + // SuperVersion back to thread local, with an atomic compare and swap. + // The SuperVersion will need to be released if detected to be stale. + void* ptr = local_sv_->Swap(SuperVersion::kSVInUse); + // Invariant: + // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage + // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage + // should only keep kSVInUse before ReturnThreadLocalSuperVersion call + // (if no Scrape happens).
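+ // The fast path below is a lock-free "borrow" protocol against the
+ // thread-local slot; loosely modeled (shapes ours; the give-back half
+ // lives in ReturnThreadLocalSuperVersion()):
+ #if 0
+ void* p = tls.Swap(kSVInUse);            // 1. claim the slot
+ if (p == kSVObsolete || IsStale(p)) {    // 2. refresh under the DB mutex
+   p = RefreshFromCurrentSuperVersion();
+ }
+ // ... read through p ...
+ if (!tls.CompareAndSwap(p, kSVInUse)) {  // 3. put back iff still "InUse"
+   Unref(p);                              //    a Scrape invalidated it
+ }
+ #endif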
+ assert(ptr != SuperVersion::kSVInUse); + SuperVersion* sv = static_cast<SuperVersion*>(ptr); + if (sv == SuperVersion::kSVObsolete || + sv->version_number != super_version_number_.load()) { + RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); + SuperVersion* sv_to_delete = nullptr; + + if (sv && sv->Unref()) { + RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); + db->mutex()->Lock(); + // NOTE: underlying resources held by superversion (sst files) might + // not be released until the next background job. + sv->Cleanup(); + if (db->immutable_db_options().avoid_unnecessary_blocking_io) { + db->AddSuperVersionsToFreeQueue(sv); + db->SchedulePurge(); + } else { + sv_to_delete = sv; + } + } else { + db->mutex()->Lock(); + } + sv = super_version_->Ref(); + db->mutex()->Unlock(); + + delete sv_to_delete; + } + assert(sv != nullptr); + return sv; +} + +bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { + assert(sv != nullptr); + // Put the SuperVersion back + void* expected = SuperVersion::kSVInUse; + if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) { + // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal + // storage has not been altered and no Scrape has happened. The + // SuperVersion is still current. + return true; + } else { + // ThreadLocal scrape happened in the process of this GetImpl call (after + // thread local Swap() at the beginning and before CompareAndSwap()). + // This means the SuperVersion it holds is obsolete. + assert(expected == SuperVersion::kSVObsolete); + } + return false; +} + +void ColumnFamilyData::InstallSuperVersion( + SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) { + db_mutex->AssertHeld(); + return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_); +} + +void ColumnFamilyData::InstallSuperVersion( + SuperVersionContext* sv_context, InstrumentedMutex* db_mutex, + const MutableCFOptions& mutable_cf_options) { + SuperVersion* new_superversion = sv_context->new_superversion.release(); + new_superversion->db_mutex = db_mutex; + new_superversion->mutable_cf_options = mutable_cf_options; + new_superversion->Init(this, mem_, imm_.current(), current_); + SuperVersion* old_superversion = super_version_; + super_version_ = new_superversion; + ++super_version_number_; + super_version_->version_number = super_version_number_; + super_version_->write_stall_condition = + RecalculateWriteStallConditions(mutable_cf_options); + + if (old_superversion != nullptr) { + // Reset SuperVersions cached in thread local storage. + // This should be done before old_superversion->Unref(). That's to ensure + // that local_sv_ never holds the last reference to SuperVersion, since + // it has no means to safely do SuperVersion cleanup.
+ ResetThreadLocalSuperVersions(); + + if (old_superversion->mutable_cf_options.write_buffer_size != + mutable_cf_options.write_buffer_size) { + mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size); + } + if (old_superversion->write_stall_condition != + new_superversion->write_stall_condition) { + sv_context->PushWriteStallNotification( + old_superversion->write_stall_condition, + new_superversion->write_stall_condition, GetName(), ioptions()); + } + if (old_superversion->Unref()) { + old_superversion->Cleanup(); + sv_context->superversions_to_free.push_back(old_superversion); + } + } +} + +void ColumnFamilyData::ResetThreadLocalSuperVersions() { + autovector<void*> sv_ptrs; + local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete); + for (auto ptr : sv_ptrs) { + assert(ptr); + if (ptr == SuperVersion::kSVInUse) { + continue; + } + auto sv = static_cast<SuperVersion*>(ptr); + bool was_last_ref __attribute__((__unused__)); + was_last_ref = sv->Unref(); + // sv couldn't have been the last reference because + // ResetThreadLocalSuperVersions() is called before + // unref'ing super_version_. + assert(!was_last_ref); + } +} + +Status ColumnFamilyData::ValidateOptions( + const DBOptions& db_options, const ColumnFamilyOptions& cf_options) { + Status s; + s = CheckCompressionSupported(cf_options); + if (s.ok() && db_options.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cf_options); + } + if (s.ok() && db_options.unordered_write && + cf_options.max_successive_merges != 0) { + s = Status::InvalidArgument( + "max_successive_merges > 0 is incompatible with unordered_write"); + } + if (s.ok()) { + s = CheckCFPathsSupported(db_options, cf_options); + } + if (!s.ok()) { + return s; + } + + if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) { + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "TTL is only supported in Block-Based Table format. "); + } + } + + if (cf_options.periodic_compaction_seconds > 0 && + cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) { + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "Periodic Compaction is only supported in " + "Block-Based Table format. "); + } + } + return s; +} + +#ifndef ROCKSDB_LITE +Status ColumnFamilyData::SetOptions( + const DBOptions& db_options, + const std::unordered_map<std::string, std::string>& options_map) { + MutableCFOptions new_mutable_cf_options; + Status s = + GetMutableOptionsFromStrings(mutable_cf_options_, options_map, + ioptions_.info_log, &new_mutable_cf_options); + if (s.ok()) { + ColumnFamilyOptions cf_options = + BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); + s = ValidateOptions(db_options, cf_options); + } + if (s.ok()) { + mutable_cf_options_ = new_mutable_cf_options; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + } + return s; +} +#endif // ROCKSDB_LITE + +// REQUIRES: DB mutex held +Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) { + if (initial_cf_options_.compaction_style != kCompactionStyleLevel) { + return Env::WLTH_NOT_SET; + } + if (level == 0) { + return Env::WLTH_MEDIUM; + } + int base_level = current_->storage_info()->base_level(); + + // L1: medium, L2: long, ... + if (level - base_level >= 2) { + return Env::WLTH_EXTREME; + } else if (level < base_level) { + // There is no restriction which prevents level passed in to be smaller + // than base_level.
+    return Env::WLTH_MEDIUM;
+  }
+  return static_cast<Env::WriteLifeTimeHint>(
+      level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
+}
+
+Status ColumnFamilyData::AddDirectories(
+    std::map<std::string, std::shared_ptr<Directory>>* created_dirs) {
+  Status s;
+  assert(created_dirs != nullptr);
+  assert(data_dirs_.empty());
+  for (auto& p : ioptions_.cf_paths) {
+    auto existing_dir = created_dirs->find(p.path);
+
+    if (existing_dir == created_dirs->end()) {
+      std::unique_ptr<Directory> path_directory;
+      s = DBImpl::CreateAndNewDirectory(ioptions_.env, p.path, &path_directory);
+      if (!s.ok()) {
+        return s;
+      }
+      assert(path_directory != nullptr);
+      data_dirs_.emplace_back(path_directory.release());
+      (*created_dirs)[p.path] = data_dirs_.back();
+    } else {
+      data_dirs_.emplace_back(existing_dir->second);
+    }
+  }
+  assert(data_dirs_.size() == ioptions_.cf_paths.size());
+  return s;
+}
+
+Directory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+  if (data_dirs_.empty()) {
+    return nullptr;
+  }
+
+  assert(path_id < data_dirs_.size());
+  return data_dirs_[path_id].get();
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+                                 const ImmutableDBOptions* db_options,
+                                 const FileOptions& file_options,
+                                 Cache* table_cache,
+                                 WriteBufferManager* write_buffer_manager,
+                                 WriteController* write_controller,
+                                 BlockCacheTracer* const block_cache_tracer)
+    : max_column_family_(0),
+      dummy_cfd_(new ColumnFamilyData(
+          0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options,
+          file_options, nullptr, block_cache_tracer)),
+      default_cfd_cache_(nullptr),
+      db_name_(dbname),
+      db_options_(db_options),
+      file_options_(file_options),
+      table_cache_(table_cache),
+      write_buffer_manager_(write_buffer_manager),
+      write_controller_(write_controller),
+      block_cache_tracer_(block_cache_tracer) {
+  // initialize linked list
+  dummy_cfd_->prev_ = dummy_cfd_;
+  dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+  while (column_family_data_.size() > 0) {
+    // cfd destructor will delete itself from column_family_data_
+    auto cfd = column_family_data_.begin()->second;
+    bool last_ref __attribute__((__unused__));
+    last_ref = cfd->UnrefAndTryDelete();
+    assert(last_ref);
+  }
+  bool dummy_last_ref __attribute__((__unused__));
+  dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+  assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+  assert(default_cfd_cache_ != nullptr);
+  return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+  auto cfd_iter = column_family_data_.find(id);
+  if (cfd_iter != column_family_data_.end()) {
+    return cfd_iter->second;
+  } else {
+    return nullptr;
+  }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
+    const {
+  auto cfd_iter = column_families_.find(name);
+  if (cfd_iter != column_families_.end()) {
+    auto cfd = GetColumnFamily(cfd_iter->second);
+    assert(cfd != nullptr);
+    return cfd;
+  } else {
+    return nullptr;
+  }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+  return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+  max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+  return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+    const std::string& name, uint32_t id, Version* dummy_versions,
+    const
ColumnFamilyOptions& options) { + assert(column_families_.find(name) == column_families_.end()); + ColumnFamilyData* new_cfd = new ColumnFamilyData( + id, name, dummy_versions, table_cache_, write_buffer_manager_, options, + *db_options_, file_options_, this, block_cache_tracer_); + column_families_.insert({name, id}); + column_family_data_.insert({id, new_cfd}); + max_column_family_ = std::max(max_column_family_, id); + // add to linked list + new_cfd->next_ = dummy_cfd_; + auto prev = dummy_cfd_->prev_; + new_cfd->prev_ = prev; + prev->next_ = new_cfd; + dummy_cfd_->prev_ = new_cfd; + if (id == 0) { + default_cfd_cache_ = new_cfd; + } + return new_cfd; +} + +// REQUIRES: DB mutex held +void ColumnFamilySet::FreeDeadColumnFamilies() { + autovector to_delete; + for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { + if (cfd->refs_.load(std::memory_order_relaxed) == 0) { + to_delete.push_back(cfd); + } + } + for (auto cfd : to_delete) { + // this is very rare, so it's not a problem that we do it under a mutex + delete cfd; + } +} + +// under a DB mutex AND from a write thread +void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { + auto cfd_iter = column_family_data_.find(cfd->GetID()); + assert(cfd_iter != column_family_data_.end()); + column_family_data_.erase(cfd_iter); + column_families_.erase(cfd->GetName()); +} + +// under a DB mutex OR from a write thread +bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { + if (column_family_id == 0) { + // optimization for common case + current_ = column_family_set_->GetDefault(); + } else { + current_ = column_family_set_->GetColumnFamily(column_family_id); + } + handle_.SetCFD(current_); + return current_ != nullptr; +} + +uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const { + assert(current_ != nullptr); + return current_->GetLogNumber(); +} + +MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const { + assert(current_ != nullptr); + return current_->mem(); +} + +ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { + assert(current_ != nullptr); + return &handle_; +} + +uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { + uint32_t column_family_id = 0; + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + column_family_id = cfh->GetID(); + } + return column_family_id; +} + +const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family) { + if (column_family != nullptr) { + return column_family->GetComparator(); + } + return nullptr; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h new file mode 100644 index 000000000..fcc8ea2cf --- /dev/null +++ b/src/rocksdb/db/column_family.h @@ -0,0 +1,757 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+
+extern const double kIncSlowdownRatio;
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// follows:
+//
+// +----------------------+ +----------------------+ +--------+
+// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
+// | +----------------------+ | +----------------------+ +----+---+
+// | +--------------------------+ |
+// | | +-----------------------------+
+// | | |
+// | | +-----------------------------v-------------------------------+
+// | | | |
+// | | | ColumnFamilySet |
+// | | | |
+// | | +-------------+--------------------------+----------------+---+
+// | | | | |
+// | +-------------------------------------+ | |
+// | | | | v
+// | +-------------v-------------+ +-----v----v---------+
+// | | | | |
+// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
+// | | | | |
+// +---> | | |
+// | +---------+ | |
+// | | MemTable| | |
+// | | List | | |
+// +--------+---+--+-+----+----+ +--------------------++
+// | | | |
+// | | | |
+// | | | +-----------------------+
+// | | +-----------+ |
+// v +--------+ | |
+// +--------+--------+ | | |
+// | | | | +----------v----------+
+// +---> |SuperVersion 1.a +-----------------> |
+// | +------+ | | MemTableListVersion |
+// +---+-------------+ | | | | |
+// | | | | +----+------------+---+
+// | current | | | | |
+// | +-------------+ | |mem | |
+// | | | | | |
+// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
+// | | | | | | | |
+// | Version 1.a | | memtable | | memtable | | memtable |
+// | | | 1.a | | 1.b | | 1.c |
+// +-------------+ | | | | | |
+// +----------+ +----------+ +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references all column families by
+// pointing to the respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find the memtables, Version
+// and SuperVersion of the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree. This guarantees that the memtables and
+// SST files being operated on will not go away, until the SuperVersion is
+// unreferenced down to 0 and destroyed.
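+//
+// A minimal reader-side sketch of this pinning scheme (illustrative
+// pseudo-code only; the real read path lives in DBImpl::GetImpl, and the
+// cleanup helper name below is hypothetical):
+//
+//   SuperVersion* sv = cfd->GetThreadLocalSuperVersion(db);  // pin a view
+//   // ... consult sv->mem, then sv->imm, then sv->current ...
+//   if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+//     CleanupSuperVersion(sv);  // unref path if the cached slot was scraped
+//   }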
+//
+// The following graph illustrates possible referencing relationships:
+//
+// Column +--------------+ current +-----------+
+// Family +---->+ +------------------->+ |
+// Data | SuperVersion +----------+ | Version A |
+// | 3 | imm | | |
+// Iter2 +----->+ | +-------v------+ +-----------+
+// +-----+--------+ | MemtableList +----------------> Empty
+// | | Version r | +-----------+
+// | +--------------+ | |
+// +------------------+ current| Version B |
+// +--------------+ | +----->+ |
+// | | | | +-----+-----+
+// Compaction +>+ SuperVersion +-------------+ ^
+// Job | 2 +------+ | |current
+// | +----+ | | mem | +------------+
+// +--------------+ | | +---------------------> |
+// | +------------------------> MemTable a |
+// | mem | | |
+// +--------------+ | | +------------+
+// | +--------------------------+
+// Iter1 +-----> SuperVersion | | +------------+
+// | 1 +------------------------------>+ |
+// | +-+ | mem | MemTable b |
+// +--------------+ | | | |
+// | | +--------------+ +-----^------+
+// | |imm | MemtableList | |
+// | +--->+ Version s +------------+
+// | +--------------+
+// | +--------------+
+// | | MemtableList |
+// +------>+ Version t +--------> Empty
+// imm +--------------+
+//
+// In this example, even if the current LSM-tree consists of Version A and
+// memtable a, which is also referenced by SuperVersion, two older
+// SuperVersions, SuperVersion2 and SuperVersion1, still exist, and are
+// referenced by a compaction job and an old iterator Iter1, respectively.
+// SuperVersion2 contains Version B, memtable a and memtable b; SuperVersion1
+// contains Version B and memtable b (mutable). As a result, Version B and
+// memtable b are prevented from being destroyed or deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when
+// the client is done using the column family
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  // create while holding the mutex
+  ColumnFamilyHandleImpl(
+      ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
+  // destroy without mutex
+  virtual ~ColumnFamilyHandleImpl();
+  virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+  virtual uint32_t GetID() const override;
+  virtual const std::string& GetName() const override;
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+  virtual const Comparator* GetComparator() const override;
+
+ private:
+  ColumnFamilyData* cfd_;
+  DBImpl* db_;
+  InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to
+// ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+  ColumnFamilyHandleInternal()
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+        internal_cfd_(nullptr) {}
+
+  void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+  ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+  // Accessing members of this class is not thread-safe and requires external
+  // synchronization (i.e. DB mutex held or on the write thread).
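+  //
+  // A minimal sketch of the intended Ref/Unref lifecycle (illustrative only;
+  // `mu` stands for the DB mutex):
+  //
+  //   SuperVersion* sv = cfd->GetSuperVersion()->Ref();  // under mu
+  //   // ... read sv->mem / sv->imm / sv->current ...
+  //   if (sv->Unref()) {      // last reference dropped
+  //     mu->Lock();
+  //     sv->Cleanup();        // must run under the mutex
+  //     mu->Unlock();
+  //     delete sv;            // deletion itself may happen outside the mutex
+  //   }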
+  ColumnFamilyData* cfd;
+  MemTable* mem;
+  MemTableListVersion* imm;
+  Version* current;
+  MutableCFOptions mutable_cf_options;
+  // Version number of the current SuperVersion
+  uint64_t version_number;
+  WriteStallCondition write_stall_condition;
+
+  InstrumentedMutex* db_mutex;
+
+  // should be called outside the mutex
+  SuperVersion() = default;
+  ~SuperVersion();
+  SuperVersion* Ref();
+  // If Unref() returns true, Cleanup() should be called with mutex held
+  // before deleting this SuperVersion.
+  bool Unref();
+
+  // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that need to be deleted in the to_delete vector. Unrefing those
+  // objects needs to be done under the mutex
+  void Cleanup();
+  void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+            MemTableListVersion* new_imm, Version* new_current);
+
+  // The value of dummy is not actually used. kSVInUse takes its address as a
+  // mark in the thread local storage to indicate the SuperVersion is in use
+  // by a thread. This way, the value of kSVInUse is guaranteed not to
+  // conflict with any SuperVersion object address, and is portable across
+  // platforms.
+  static int dummy;
+  static void* const kSVInUse;
+  static void* const kSVObsolete;
+
+ private:
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of the mutex, during destruction
+  autovector<MemTable*> to_delete;
+};
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+    const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+                                    const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                           const ColumnFamilyOptions& src);
+// Wrap user defined table properties collector factories from cf_options
+// into internal ones in int_tbl_prop_collector_factories. Add a system
+// internal one too.
+extern void GetIntTblPropCollectorFactory(
+    const ImmutableCFOptions& ioptions,
+    std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+        int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+  ~ColumnFamilyData();
+
+  // thread-safe
+  uint32_t GetID() const { return id_; }
+  // thread-safe
+  const std::string& GetName() const { return name_; }
+
+  // Ref() can only be called from a context where the caller can guarantee
+  // that ColumnFamilyData is alive (while holding a non-zero ref already,
+  // holding a DB mutex, or as the leader in a write batch group).
+  void Ref() { refs_.fetch_add(1); }
+
+  // Unref decreases the reference count, but does not handle deletion
+  // when the count goes to 0. If this method returns true then the
+  // caller should delete the instance immediately, or later, by calling
+  // FreeDeadColumnFamilies(). Unref() can only be called while holding
+  // a DB mutex, or during single-threaded recovery.
+  bool Unref() {
+    int old_refs = refs_.fetch_sub(1);
+    assert(old_refs > 0);
+    return old_refs == 1;
+  }
+
+  // UnrefAndTryDelete() decreases the reference count and frees the object
+  // if needed; it returns true if this instance was freed, else false.
+  // UnrefAndTryDelete() can only be called while holding a DB mutex, or
+  // during single-threaded recovery.
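+  // A minimal caller-side sketch (illustrative only; assumes the DB mutex is
+  // held):
+  //   if (cfd->UnrefAndTryDelete()) {
+  //     // cfd was the last reference and has been deleted; don't touch it.
+  //   }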
+  bool UnrefAndTryDelete();
+
+  // SetDropped() can only be called under the following conditions:
+  // 1) Holding a DB mutex,
+  // 2) from single-threaded write thread, AND
+  // 3) from single-threaded VersionSet::LogAndApply()
+  // After dropping a column family no other operation on that column family
+  // will be executed. All the files and memory will be, however, kept around
+  // until the client drops the column family handle. That way, the client can
+  // still access data from the dropped column family.
+  // A column family can be dropped but still be alive. In that state:
+  // *) Compaction and flush are not executed on the dropped column family.
+  // *) Client can continue reading from the column family. Writes will fail
+  //    unless WriteOptions::ignore_missing_column_families is true
+  // When the dropped column family is unreferenced, then we:
+  // *) Remove column family from the linked list maintained by ColumnFamilySet
+  // *) delete all memory associated with that column family
+  // *) delete all the files associated with that column family
+  void SetDropped();
+  bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+  // thread-safe
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  void SetFlushReason(FlushReason flush_reason) {
+    flush_reason_ = flush_reason;
+  }
+  FlushReason GetFlushReason() const { return flush_reason_; }
+  // thread-safe
+  const FileOptions* soptions() const;
+  const ImmutableCFOptions* ioptions() const { return &ioptions_; }
+  // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by the current SuperVersion
+  // You should use this API to reference MutableCFOptions most of the time.
+  const MutableCFOptions* GetCurrentMutableCFOptions() const {
+    return &(super_version_->mutable_cf_options);
+  }
+  // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may not be in effect yet.
+  const MutableCFOptions* GetLatestMutableCFOptions() const {
+    return &mutable_cf_options_;
+  }
+
+  // REQUIRES: DB mutex held
+  // Build ColumnFamilyOptions with immutable options and latest mutable
+  // options.
+  ColumnFamilyOptions GetLatestCFOptions() const;
+
+  bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+  // Validate CF options against DB options
+  static Status ValidateOptions(const DBOptions& db_options,
+                                const ColumnFamilyOptions& cf_options);
+#ifndef ROCKSDB_LITE
+  // REQUIRES: DB mutex held
+  Status SetOptions(
+      const DBOptions& db_options,
+      const std::unordered_map<std::string, std::string>& options_map);
+#endif  // ROCKSDB_LITE
+
+  InternalStats* internal_stats() { return internal_stats_.get(); }
+
+  MemTableList* imm() { return &imm_; }
+  MemTable* mem() { return mem_; }
+  Version* current() { return current_; }
+  Version* dummy_versions() { return dummy_versions_; }
+  void SetCurrent(Version* _current);
+  uint64_t GetNumLiveVersions() const;    // REQUIRES: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;  // REQUIRES: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;   // REQUIRES: DB mutex held
+  void SetMemtable(MemTable* new_mem) {
+    uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+    new_mem->SetID(memtable_id);
+    mem_ = new_mem;
+  }
+
+  // calculate the oldest log needed for the durability of this column family
+  uint64_t OldestLogToKeep();
+
+  // See Memtable constructor for explanation of earliest_seq param.
+  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+                                 SequenceNumber earliest_seq);
+  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+                         SequenceNumber earliest_seq);
+
+  TableCache* table_cache() const { return table_cache_.get(); }
+
+  // See documentation in compaction_picker.h
+  // REQUIRES: DB mutex held
+  bool NeedsCompaction() const;
+  // REQUIRES: DB mutex held
+  Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+                             LogBuffer* log_buffer);
+
+  // Check if the passed range overlaps with any running compactions.
+  // REQUIRES: DB mutex held
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Check if the passed ranges overlap with any unflushed memtables
+  // (immutable or mutable).
+  //
+  // @param super_version A referenced SuperVersion that will be held for the
+  //    duration of this function.
+  //
+  // Thread-safe
+  Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+                                    SuperVersion* super_version, bool* overlap);
+
+  // A flag to tell that a manual compaction should compact all levels
+  // together instead of a specific level.
+  static const int kCompactAllLevels;
+  // A flag to tell that a manual compaction's output is the base level.
+  static const int kCompactToBaseLevel;
+  // REQUIRES: DB mutex held
+  Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+                           int input_level, int output_level,
+                           const CompactRangeOptions& compact_range_options,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end, bool* manual_conflict,
+                           uint64_t max_file_num_to_ignore);
+
+  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+  // thread-safe
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+  // thread-safe
+  const InternalKeyComparator& internal_comparator() const {
+    return internal_comparator_;
+  }
+
+  const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+  int_tbl_prop_collector_factories() const {
+    return &int_tbl_prop_collector_factories_;
+  }
+
+  SuperVersion* GetSuperVersion() { return super_version_; }
+  // thread-safe
+  // Return an already referenced SuperVersion to be used safely.
+  SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+  // thread-safe
+  // Get SuperVersion stored in thread local storage. If it does not exist,
+  // get a reference from the current SuperVersion.
+  SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+  // Try to return SuperVersion back to thread local storage. Return true on
+  // success and false on failure. It fails when the thread local storage
+  // contains anything other than the SuperVersion::kSVInUse flag.
+  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+  // thread-safe
+  uint64_t GetSuperVersionNumber() const {
+    return super_version_number_.load();
+  }
+  // Installs the new SuperVersion. The previous SuperVersion is handed back
+  // through sv_context if its reference count drops to zero and it needs
+  // deletion, or nullptr if not. sv_context carries a pre-allocated
+  // SuperVersion, so that clients can allocate it outside of the mutex.
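+  // A minimal call-site sketch (illustrative only; it assumes that
+  // SuperVersionContext pre-allocates new_superversion when asked to, and
+  // that Clean() defers freeing until after the mutex is released):
+  //   SuperVersionContext ctx(/*create_superversion=*/true);  // outside mutex
+  //   mu->Lock();
+  //   cfd->InstallSuperVersion(&ctx, mu);
+  //   mu->Unlock();
+  //   ctx.Clean();  // free replaced SuperVersions outside the mutex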
+  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           InstrumentedMutex* db_mutex,
+                           const MutableCFOptions& mutable_cf_options);
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           InstrumentedMutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // Protected by DB mutex
+  void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+  void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
+  bool queued_for_flush() { return queued_for_flush_; }
+  bool queued_for_compaction() { return queued_for_compaction_; }
+
+  enum class WriteStallCause {
+    kNone,
+    kMemtableLimit,
+    kL0FileCountLimit,
+    kPendingCompactionBytes,
+  };
+  static std::pair<WriteStallCondition, WriteStallCause>
+  GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files,
+                                 uint64_t num_compaction_needed_bytes,
+                                 const MutableCFOptions& mutable_cf_options);
+
+  // Recalculate the write stall conditions, which change only during
+  // compaction, when adding a new memtable, and/or when recalculating the
+  // compaction score. These values are used in the DBImpl::MakeRoomForWrite
+  // function to decide whether it needs to make a write stall.
+  WriteStallCondition RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options);
+
+  void set_initialized() { initialized_.store(true); }
+
+  bool initialized() const { return initialized_.load(); }
+
+  const ColumnFamilyOptions& initial_cf_options() {
+    return initial_cf_options_;
+  }
+
+  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+  // created_dirs remembers the directories created, so that we don't need to
+  // repeat the same directory creation operation again.
+  Status AddDirectories(
+      std::map<std::string, std::shared_ptr<Directory>>* created_dirs);
+
+  Directory* GetDataDir(size_t path_id) const;
+
+  ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(uint32_t id, const std::string& name,
+                   Version* dummy_versions, Cache* table_cache,
+                   WriteBufferManager* write_buffer_manager,
+                   const ColumnFamilyOptions& options,
+                   const ImmutableDBOptions& db_options,
+                   const FileOptions& file_options,
+                   ColumnFamilySet* column_family_set,
+                   BlockCacheTracer* const block_cache_tracer);
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;         // == dummy_versions->prev_
+
+  std::atomic<int> refs_;  // outstanding references to ColumnFamilyData
+  std::atomic<bool> initialized_;
+  std::atomic<bool> dropped_;  // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories_;
+
+  const ColumnFamilyOptions initial_cf_options_;
+  const ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+
+  const bool is_delete_range_supported_;
+
+  std::unique_ptr<TableCache> table_cache_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  WriteBufferManager* write_buffer_manager_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. We use it to support iterations over
+  // all column families that are alive (note: dropped column families can also
+  // be alive as long as client holds a reference)
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family. All earlier log files must be ignored and not
+  // recovered from
+  uint64_t log_number_;
+
+  std::atomic<FlushReason> flush_reason_;
+
+  // An object that keeps all the compaction stats
+  // and picks the next compaction
+  std::unique_ptr<CompactionPicker> compaction_picker_;
+
+  ColumnFamilySet* column_family_set_;
+
+  std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+  // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+  bool queued_for_flush_;
+
+  // If true --> this ColumnFamily is currently present in
+  // DBImpl::compaction_queue_
+  bool queued_for_compaction_;
+
+  uint64_t prev_compaction_needed_bytes_;
+
+  // if the database was opened with 2pc enabled
+  bool allow_2pc_;
+
+  // Memtable id to track flush.
+  std::atomic<uint64_t> last_memtable_id_;
+
+  // Directories corresponding to cf_paths.
+  std::vector<std::shared_ptr<Directory>> data_dirs_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
+// * Iteration -- hold DB mutex, but you can release it in the body of the
+// iteration. If you release the DB mutex in the body, reference the column
+// family before the mutex and unreference after you unlock, since the column
+// family might get dropped when the DB mutex is released
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
+class ColumnFamilySet {
+ public:
+  // ColumnFamilySet supports iteration
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilyData* cfd)
+        : current_(cfd) {}
+    iterator& operator++() {
+      // dropped column families might still be included in this iteration
+      // (we're only removing them when client drops the last reference to the
+      // column family).
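+      // Dead entries (refs_ == 0) are therefore skipped by the do/while
+      // below;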
+ // dummy is never dead, so this will never be infinite + do { + current_ = current_->next_; + } while (current_->refs_.load(std::memory_order_relaxed) == 0); + return *this; + } + bool operator!=(const iterator& other) { + return this->current_ != other.current_; + } + ColumnFamilyData* operator*() { return current_; } + + private: + ColumnFamilyData* current_; + }; + + ColumnFamilySet(const std::string& dbname, + const ImmutableDBOptions* db_options, + const FileOptions& file_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); + ~ColumnFamilySet(); + + ColumnFamilyData* GetDefault() const; + // GetColumnFamily() calls return nullptr if column family is not found + ColumnFamilyData* GetColumnFamily(uint32_t id) const; + ColumnFamilyData* GetColumnFamily(const std::string& name) const; + // this call will return the next available column family ID. it guarantees + // that there is no column family with id greater than or equal to the + // returned value in the current running instance or anytime in RocksDB + // instance history. + uint32_t GetNextColumnFamilyID(); + uint32_t GetMaxColumnFamily(); + void UpdateMaxColumnFamily(uint32_t new_max_column_family); + size_t NumberOfColumnFamilies() const; + + ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id, + Version* dummy_version, + const ColumnFamilyOptions& options); + + iterator begin() { return iterator(dummy_cfd_->next_); } + iterator end() { return iterator(dummy_cfd_); } + + // REQUIRES: DB mutex held + // Don't call while iterating over ColumnFamilySet + void FreeDeadColumnFamilies(); + + Cache* get_table_cache() { return table_cache_; } + + private: + friend class ColumnFamilyData; + // helper function that gets called from cfd destructor + // REQUIRES: DB mutex held + void RemoveColumnFamily(ColumnFamilyData* cfd); + + // column_families_ and column_family_data_ need to be protected: + // * when mutating both conditions have to be satisfied: + // 1. DB mutex locked + // 2. thread currently in single-threaded write thread + // * when reading, at least one condition needs to be satisfied: + // 1. DB mutex locked + // 2. accessed from a single-threaded write thread + std::unordered_map column_families_; + std::unordered_map column_family_data_; + + uint32_t max_column_family_; + ColumnFamilyData* dummy_cfd_; + // We don't hold the refcount here, since default column family always exists + // We are also not responsible for cleaning up default_cfd_cache_. This is + // just a cache that makes common case (accessing default column family) + // faster + ColumnFamilyData* default_cfd_cache_; + + const std::string db_name_; + const ImmutableDBOptions* const db_options_; + const FileOptions file_options_; + Cache* table_cache_; + WriteBufferManager* write_buffer_manager_; + WriteController* write_controller_; + BlockCacheTracer* const block_cache_tracer_; +}; + +// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access +// memtables of different column families (specified by ID in the write batch) +class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { + public: + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) + : column_family_set_(column_family_set), current_(nullptr) {} + + // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed + // with the arguments used to construct *orig. 
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) + : column_family_set_(orig->column_family_set_), current_(nullptr) {} + + // sets current_ to ColumnFamilyData with column_family_id + // returns false if column family doesn't exist + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + bool Seek(uint32_t column_family_id) override; + + // Returns log number of the selected column family + // REQUIRES: under a DB mutex OR from a write thread + uint64_t GetLogNumber() const override; + + // REQUIRES: Seek() called first + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + virtual MemTable* GetMemTable() const override; + + // Returns column family handle for the selected column family + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + + // Cannot be called while another thread is calling Seek(). + // REQUIRES: use this function of DBImpl::column_family_memtables_ should be + // under a DB mutex OR from a write thread + virtual ColumnFamilyData* current() override { return current_; } + + private: + ColumnFamilySet* column_family_set_; + ColumnFamilyData* current_; + ColumnFamilyHandleInternal handle_; +}; + +extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); + +extern const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc new file mode 100644 index 000000000..24ff4e08b --- /dev/null +++ b/src/rocksdb/db/column_family_test.cc @@ -0,0 +1,3387 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "memtable/hash_skiplist_rep.h" +#include "options/options_parser.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +static const int kValueSize = 1000; + +namespace { +std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} +} // anonymous namespace + +// counts how many operations were performed +class EnvCounter : public EnvWrapper { + public: + explicit EnvCounter(Env* base) + : EnvWrapper(base), num_new_writable_file_(0) {} + int GetNumberOfNewWritableFileCalls() { + return num_new_writable_file_; + } + Status NewWritableFile(const std::string& f, std::unique_ptr* r, + const EnvOptions& soptions) override { + ++num_new_writable_file_; + return EnvWrapper::NewWritableFile(f, r, soptions); + } + + private: + std::atomic num_new_writable_file_; +}; + +class ColumnFamilyTestBase : public testing::Test { + public: + explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { + Env* base_env = Env::Default(); +#ifndef ROCKSDB_LITE + const char* test_env_uri = getenv("TEST_ENV_URI"); + if (test_env_uri) { + Env* test_env = nullptr; + Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); + base_env = test_env; + EXPECT_OK(s); + EXPECT_NE(Env::Default(), base_env); + } +#endif // !ROCKSDB_LITE + EXPECT_NE(nullptr, base_env); + env_ = new EnvCounter(base_env); + dbname_ = test::PerThreadDBPath("column_family_test"); + db_options_.create_if_missing = true; + db_options_.fail_if_options_file_error = true; + db_options_.env = env_; + DestroyDB(dbname_, Options(db_options_, column_family_options_)); + } + + ~ColumnFamilyTestBase() override { + std::vector column_families; + for (auto h : handles_) { + ColumnFamilyDescriptor cfdescriptor; + h->GetDescriptor(&cfdescriptor); + column_families.push_back(cfdescriptor); + } + Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + Destroy(column_families); + delete env_; + } + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions options; + options.format_version = format_; + return options; + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + if (k == 0) { + // Ugh. Random seed of 0 used to produce no entropy. This code + // preserves the implementation that was in place when all of the + // magic values in this file were picked. 
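+      // (For k != 0 below, Value() seeds Random with k, so the returned
+      // slice is a deterministic function of k; CheckMissed() relies on this
+      // to recompute the expected value for every key it sees.)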
+ *storage = std::string(kValueSize, ' '); + return Slice(*storage); + } else { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } + } + + void Build(int base, int n, int flush_every = 0) { + std::string key_space, value_space; + WriteBatch batch; + + for (int i = 0; i < n; i++) { + if (flush_every != 0 && i != 0 && i % flush_every == 0) { + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + } + + int keyi = base + i; + Slice key(DBTestBase::Key(keyi)); + + batch.Clear(); + batch.Put(handles_[0], key, Value(keyi, &value_space)); + batch.Put(handles_[1], key, Value(keyi, &value_space)); + batch.Put(handles_[2], key, Value(keyi, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void CheckMissed() { + uint64_t next_expected = 0; + uint64_t missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + for (int cf = 0; cf < 3; cf++) { + next_expected = 0; + Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + in.remove_prefix(3); + if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(static_cast(key), &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + } + + ASSERT_EQ(0, bad_keys); + ASSERT_EQ(0, bad_values); + ASSERT_EQ(0, missed); + (void)correct; + } + + void Close() { + for (auto h : handles_) { + if (h) { + db_->DestroyColumnFamilyHandle(h); + } + } + handles_.clear(); + names_.clear(); + delete db_; + db_ = nullptr; + } + + Status TryOpen(std::vector cf, + std::vector options = {}) { + std::vector column_families; + names_.clear(); + for (size_t i = 0; i < cf.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor( + cf[i], options.size() == 0 ? column_family_options_ : options[i])); + names_.push_back(cf[i]); + } + return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); + } + + Status OpenReadOnly(std::vector cf, + std::vector options = {}) { + std::vector column_families; + names_.clear(); + for (size_t i = 0; i < cf.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor( + cf[i], options.size() == 0 ? 
column_family_options_ : options[i])); + names_.push_back(cf[i]); + } + return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_, + &db_); + } + +#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported + void AssertOpenReadOnly(std::vector cf, + std::vector options = {}) { + ASSERT_OK(OpenReadOnly(cf, options)); + } +#endif // !ROCKSDB_LITE + + + void Open(std::vector cf, + std::vector options = {}) { + ASSERT_OK(TryOpen(cf, options)); + } + + void Open() { + Open({"default"}); + } + + DBImpl* dbfull() { return reinterpret_cast(db_); } + + int GetProperty(int cf, std::string property) { + std::string value; + EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value)); +#ifndef CYGWIN + return std::stoi(value); +#else + return std::strtol(value.c_str(), 0 /* off */, 10 /* base */); +#endif + } + + bool IsDbWriteStopped() { +#ifndef ROCKSDB_LITE + uint64_t v; + EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v)); + return (v == 1); +#else + return dbfull()->TEST_write_controler().IsStopped(); +#endif // !ROCKSDB_LITE + } + + uint64_t GetDbDelayedWriteRate() { +#ifndef ROCKSDB_LITE + uint64_t v; + EXPECT_TRUE( + dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)); + return v; +#else + if (!dbfull()->TEST_write_controler().NeedsDelay()) { + return 0; + } + return dbfull()->TEST_write_controler().delayed_write_rate(); +#endif // !ROCKSDB_LITE + } + + void Destroy(const std::vector& column_families = + std::vector()) { + Close(); + ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_), + column_families)); + } + + void CreateColumnFamilies( + const std::vector& cfs, + const std::vector options = {}) { + int cfi = static_cast(handles_.size()); + handles_.resize(cfi + cfs.size()); + names_.resize(cfi + cfs.size()); + for (size_t i = 0; i < cfs.size(); ++i) { + const auto& current_cf_opt = + options.size() == 0 ? column_family_options_ : options[i]; + ASSERT_OK( + db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi])); + names_[cfi] = cfs[i]; + +#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor + // Verify the CF options of the returned CF handle. 
+ ColumnFamilyDescriptor desc; + ASSERT_OK(handles_[cfi]->GetDescriptor(&desc)); + RocksDBOptionsParser::VerifyCFOptions(desc.options, current_cf_opt); +#endif // !ROCKSDB_LITE + cfi++; + } + } + + void Reopen(const std::vector options = {}) { + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + Close(); + assert(options.size() == 0 || names.size() == options.size()); + Open(names, options); + } + + void CreateColumnFamiliesAndReopen(const std::vector& cfs) { + CreateColumnFamilies(cfs); + Reopen(); + } + + void DropColumnFamilies(const std::vector& cfs) { + for (auto cf : cfs) { + ASSERT_OK(db_->DropColumnFamily(handles_[cf])); + db_->DestroyColumnFamilyHandle(handles_[cf]); + handles_[cf] = nullptr; + names_[cf] = ""; + } + } + + void PutRandomData(int cf, int num, int key_value_size, bool save = false) { + if (cf >= static_cast(keys_.size())) { + keys_.resize(cf + 1); + } + for (int i = 0; i < num; ++i) { + // 10 bytes for key, rest is value + if (!save) { + ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11), + RandomString(&rnd_, key_value_size - 10))); + } else { + std::string key = test::RandomKey(&rnd_, 11); + keys_[cf].insert(key); + ASSERT_OK(Put(cf, key, RandomString(&rnd_, key_value_size - 10))); + } + } + db_->FlushWAL(false); + } + +#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite + void WaitForFlush(int cf) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); + } + + void WaitForCompaction() { + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + uint64_t MaxTotalInMemoryState() { + return dbfull()->TEST_MaxTotalInMemoryState(); + } + + void AssertMaxTotalInMemoryState(uint64_t value) { + ASSERT_EQ(value, MaxTotalInMemoryState()); + } +#endif // !ROCKSDB_LITE + + Status Put(int cf, const std::string& key, const std::string& value) { + return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value)); + } + Status Merge(int cf, const std::string& key, const std::string& value) { + return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value)); + } + Status Flush(int cf) { + return db_->Flush(FlushOptions(), handles_[cf]); + } + + std::string Get(int cf, const std::string& key) { + ReadOptions options; + options.verify_checksums = true; + std::string result; + Status s = db_->Get(options, handles_[cf], Slice(key), &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + void CompactAll(int cf) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr, + nullptr)); + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); + } + + int NumTableFilesAtLevel(int level, int cf) { + return GetProperty(cf, + "rocksdb.num-files-at-level" + ToString(level)); + } + +#ifndef ROCKSDB_LITE + // Return spread of files per level + std::string FilesPerLevel(int cf) { + std::string result; + int last_non_zero_offset = 0; + for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = static_cast(result.size()); + } + } + result.resize(last_non_zero_offset); + return result; + } +#endif + + void AssertFilesPerLevel(const std::string& value, int cf) { +#ifndef ROCKSDB_LITE + ASSERT_EQ(value, FilesPerLevel(cf)); +#else + (void) value; + (void) cf; +#endif + } + +#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported + int CountLiveFiles() { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + return static_cast(metadata.size()); + } +#endif // !ROCKSDB_LITE + + void AssertCountLiveFiles(int expected_value) { +#ifndef ROCKSDB_LITE + ASSERT_EQ(expected_value, CountLiveFiles()); +#else + (void) expected_value; +#endif + } + + // Do n memtable flushes, each of which produces an sstable + // covering the range [small,large]. + void MakeTables(int cf, int n, const std::string& small, + const std::string& large) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf])); + } + } + +#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported + int CountLiveLogFiles() { + int micros_wait_for_log_deletion = 20000; + env_->SleepForMicroseconds(micros_wait_for_log_deletion); + int ret = 0; + VectorLogPtr wal_files; + Status s; + // GetSortedWalFiles is a flakey function -- it gets all the wal_dir + // children files and then later checks for their existence. if some of the + // log files doesn't exist anymore, it reports an error. it does all of this + // without DB mutex held, so if a background process deletes the log file + // while the function is being executed, it returns an error. We retry the + // function 10 times to avoid the error failing the test + for (int retries = 0; retries < 10; ++retries) { + wal_files.clear(); + s = db_->GetSortedWalFiles(wal_files); + if (s.ok()) { + break; + } + } + EXPECT_OK(s); + for (const auto& wal : wal_files) { + if (wal->Type() == kAliveLogFile) { + ++ret; + } + } + return ret; + return 0; + } +#endif // !ROCKSDB_LITE + + void AssertCountLiveLogFiles(int value) { +#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported + ASSERT_EQ(value, CountLiveLogFiles()); +#else + (void) value; +#endif // !ROCKSDB_LITE + } + + void AssertNumberOfImmutableMemtables(std::vector num_per_cf) { + assert(num_per_cf.size() == handles_.size()); + +#ifndef ROCKSDB_LITE // GetProperty is not supported in lite + for (size_t i = 0; i < num_per_cf.size(); ++i) { + ASSERT_EQ(num_per_cf[i], GetProperty(static_cast(i), + "rocksdb.num-immutable-mem-table")); + } +#endif // !ROCKSDB_LITE + } + + void CopyFile(const std::string& source, const std::string& destination, + uint64_t size = 0) { + const EnvOptions soptions; + std::unique_ptr srcfile; + ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions)); + std::unique_ptr destfile; + ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions)); + + if (size == 0) { + // default argument means copy everything + ASSERT_OK(env_->GetFileSize(source, &size)); + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + uint64_t one = std::min(uint64_t(sizeof(buffer)), size); + ASSERT_OK(srcfile->Read(one, &slice, buffer)); + ASSERT_OK(destfile->Append(slice)); + size -= slice.size(); + } + ASSERT_OK(destfile->Close()); + } + + int GetSstFileCount(std::string path) { + std::vector files; + DBTestBase::GetSstFiles(env_, path, &files); + return static_cast(files.size()); + } + + void 
RecalculateWriteStallConditions(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options) { + // add lock to avoid race condition between + // `RecalculateWriteStallConditions` which writes to CFStats and + // background `DBImpl::DumpStats()` threads which read CFStats + dbfull()->TEST_LockMutex(); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + dbfull()-> TEST_UnlockMutex(); + } + + std::vector handles_; + std::vector names_; + std::vector> keys_; + ColumnFamilyOptions column_family_options_; + DBOptions db_options_; + std::string dbname_; + DB* db_ = nullptr; + EnvCounter* env_; + std::shared_ptr env_guard_; + Random rnd_; + uint32_t format_; +}; + +class ColumnFamilyTest + : public ColumnFamilyTestBase, + virtual public ::testing::WithParamInterface { + public: + ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {} +}; + +INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) { + for (int iter = 0; iter < 3; ++iter) { + Open(); + CreateColumnFamilies({"one", "two", "three"}); + for (size_t i = 0; i < handles_.size(); ++i) { + auto cfh = reinterpret_cast(handles_[i]); + ASSERT_EQ(i, cfh->GetID()); + } + if (iter == 1) { + Reopen(); + } + DropColumnFamilies({3}); + Reopen(); + if (iter == 2) { + // this tests if max_column_family is correctly persisted with + // WriteSnapshot() + Reopen(); + } + CreateColumnFamilies({"three2"}); + // ID 3 that was used for dropped column family "three" should not be + // reused + auto cfh3 = reinterpret_cast(handles_[3]); + ASSERT_EQ(4U, cfh3->GetID()); + Close(); + Destroy(); + } +} + +#ifndef ROCKSDB_LITE +TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { + Open(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WriteOptionsFile:1", + "ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"}, + {"ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2", + "DBImpl::WriteOptionsFile:2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread thread( + [&] { CreateColumnFamilies({"one"}); }); + + TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"); + uint64_t pv; + db_->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, &pv); + TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2"); + + thread.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // !ROCKSDB_LITE + +class FlushEmptyCFTestWithParam + : public ColumnFamilyTestBase, + virtual public testing::WithParamInterface> { + public: + FlushEmptyCFTestWithParam() + : ColumnFamilyTestBase(std::get<0>(GetParam())), + allow_2pc_(std::get<1>(GetParam())) {} + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + bool allow_2pc_; +}; + +TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + db_options_.env = fault_env.get(); + db_options_.allow_2pc = allow_2pc_; + Open(); + CreateColumnFamilies({"one", "two"}); + // Generate log file A. + ASSERT_OK(Put(1, "foo", "v1")); // seqID 1 + + Reopen(); + // Log file A is not dropped after reopening because default column family's + // min log number is 0. 
+ // It flushes to SST file X + ASSERT_OK(Put(1, "foo", "v1")); // seqID 2 + ASSERT_OK(Put(1, "bar", "v2")); // seqID 3 + // Current log file is file B now. While flushing, a new log file C is created + // and is set to current. Boths' min log number is set to file C in memory, so + // after flushing file B is deleted. At the same time, the min log number of + // default CF is not written to manifest. Log file A still remains. + // Flushed to SST file Y. + Flush(1); + Flush(0); + ASSERT_OK(Put(1, "bar", "v3")); // seqID 4 + ASSERT_OK(Put(1, "foo", "v4")); // seqID 5 + db_->FlushWAL(false); + + // Preserve file system state up to here to simulate a crash condition. + fault_env->SetFilesystemActive(false); + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + + Close(); + fault_env->ResetState(); + + // Before opening, there are four files: + // Log file A contains seqID 1 + // Log file C contains seqID 4, 5 + // SST file X contains seqID 1 + // SST file Y contains seqID 2, 3 + // Min log number: + // default CF: 0 + // CF one, two: C + // When opening the DB, all the seqID should be preserved. + Open(names, {}); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); + Close(); + + db_options_.env = env_; +} + +TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + db_options_.env = fault_env.get(); + db_options_.allow_2pc = allow_2pc_; + Open(); + CreateColumnFamilies({"one", "two"}); + // Generate log file A. + ASSERT_OK(Put(1, "foo", "v1")); // seqID 1 + + Reopen(); + // Log file A is not dropped after reopening because default column family's + // min log number is 0. + // It flushes to SST file X + ASSERT_OK(Put(1, "foo", "v1")); // seqID 2 + ASSERT_OK(Put(1, "bar", "v2")); // seqID 3 + // Current log file is file B now. While flushing, a new log file C is created + // and is set to current. Both CFs' min log number is set to file C so after + // flushing file B is deleted. Log file A still remains. + // Flushed to SST file Y. + Flush(1); + ASSERT_OK(Put(0, "bar", "v2")); // seqID 4 + ASSERT_OK(Put(2, "bar", "v2")); // seqID 5 + ASSERT_OK(Put(1, "bar", "v3")); // seqID 6 + // Flushing all column families. This forces all CFs' min log to current. This + // is written to the manifest file. Log file C is cleared. + Flush(0); + Flush(1); + Flush(2); + // Write to log file D + ASSERT_OK(Put(1, "bar", "v4")); // seqID 7 + ASSERT_OK(Put(1, "bar", "v5")); // seqID 8 + db_->FlushWAL(false); + // Preserve file system state up to here to simulate a crash condition. + fault_env->SetFilesystemActive(false); + std::vector names; + for (auto name : names_) { + if (name != "") { + names.push_back(name); + } + } + + Close(); + fault_env->ResetState(); + // Before opening, there are two logfiles: + // Log file A contains seqID 1 + // Log file D contains seqID 7, 8 + // Min log number: + // default CF: D + // CF one, two: D + // When opening the DB, log file D should be replayed using the seqID + // specified in the file. 
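+  // (Hence the asserts below: "foo" is recovered as v1 from the SST files,
+  // while "bar" picks up v5 by replaying log file D.)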
+ Open(names, {}); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "bar")); + Close(); + + db_options_.env = env_; +} + +INSTANTIATE_TEST_CASE_P( + FormatDef, FlushEmptyCFTestWithParam, + testing::Values(std::make_tuple(test::kDefaultFormatVersion, true), + std::make_tuple(test::kDefaultFormatVersion, false))); +INSTANTIATE_TEST_CASE_P( + FormatLatest, FlushEmptyCFTestWithParam, + testing::Values(std::make_tuple(test::kLatestFormatVersion, true), + std::make_tuple(test::kLatestFormatVersion, false))); + +TEST_P(ColumnFamilyTest, AddDrop) { + Open(); + CreateColumnFamilies({"one", "two", "three"}); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(2, "fodor")); + DropColumnFamilies({2}); + ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); + CreateColumnFamilies({"four"}); + ASSERT_EQ("NOT_FOUND", Get(3, "fodor")); + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_EQ("mirko", Get(1, "fodor")); + ASSERT_EQ("NOT_FOUND", Get(3, "fodor")); + Close(); + ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument()); + Open({"default", "one", "three", "four"}); + DropColumnFamilies({1}); + Reopen(); + Close(); + + std::vector families; + ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families)); + std::sort(families.begin(), families.end()); + ASSERT_TRUE(families == + std::vector({"default", "four", "three"})); +} + +TEST_P(ColumnFamilyTest, BulkAddDrop) { + constexpr int kNumCF = 1000; + ColumnFamilyOptions cf_options; + WriteOptions write_options; + Open(); + std::vector cf_names; + std::vector cf_handles; + for (int i = 1; i <= kNumCF; i++) { + cf_names.push_back("cf1-" + ToString(i)); + } + ASSERT_OK(db_->CreateColumnFamilies(cf_options, cf_names, &cf_handles)); + for (int i = 1; i <= kNumCF; i++) { + ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar")); + } + ASSERT_OK(db_->DropColumnFamilies(cf_handles)); + std::vector cf_descriptors; + for (auto* handle : cf_handles) { + delete handle; + } + cf_handles.clear(); + for (int i = 1; i <= kNumCF; i++) { + cf_descriptors.emplace_back("cf2-" + ToString(i), ColumnFamilyOptions()); + } + ASSERT_OK(db_->CreateColumnFamilies(cf_descriptors, &cf_handles)); + for (int i = 1; i <= kNumCF; i++) { + ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar")); + } + ASSERT_OK(db_->DropColumnFamilies(cf_handles)); + for (auto* handle : cf_handles) { + delete handle; + } + Close(); + std::vector families; + ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families)); + std::sort(families.begin(), families.end()); + ASSERT_TRUE(families == std::vector({"default"})); +} + +TEST_P(ColumnFamilyTest, DropTest) { + // first iteration - dont reopen DB before dropping + // second iteration - reopen DB before dropping + for (int iter = 0; iter < 2; ++iter) { + Open({"default"}); + CreateColumnFamiliesAndReopen({"pikachu"}); + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i))); + } + ASSERT_OK(Flush(1)); + + if (iter == 1) { + Reopen(); + } + ASSERT_EQ("bar1", Get(1, "1")); + + AssertCountLiveFiles(1); + DropColumnFamilies({1}); + // make sure that all files are deleted when we drop the column family + AssertCountLiveFiles(0); + Destroy(); + } +} + +TEST_P(ColumnFamilyTest, WriteBatchFailure) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + WriteBatch batch; + batch.Put(handles_[0], Slice("existing"), Slice("column-family")); + batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + 
+  DropColumnFamilies({1});
+  WriteOptions woptions_ignore_missing_cf;
+  woptions_ignore_missing_cf.ignore_missing_column_families = true;
+  batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
+  ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+  ASSERT_EQ("column-family", Get(0, "still here"));
+  Status s = db_->Write(WriteOptions(), &batch);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, ReadWrite) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int iter = 0; iter <= 3; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  // delete old files in backup_logs directory
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+  std::vector<std::string> old_files;
+  env_->GetChildren(backup_logs, &old_files);
+  for (auto& file : old_files) {
+    if (file != "." && file != "..") {
+      env_->DeleteFile(backup_logs + "/" + file);
+    }
+  }
+
+  column_family_options_.merge_operator =
+      MergeOperators::CreateUInt64AddOperator();
+  db_options_.wal_dir = dbname_ + "/logs";
+  Destroy();
+  Open();
+  CreateColumnFamilies({"cf1", "cf2"});
+
+  // fill up the DB
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(1, "mirko", one));
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(2, "fodor", one));
+  ASSERT_OK(Merge(0, "bar", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(1, "mirko", two));
+  ASSERT_OK(Merge(1, "franjo", one));
+
+  // copy the logs to backup
+  std::vector<std::string> logs;
+  env_->GetChildren(db_options_.wal_dir, &logs);
+  for (auto& log : logs) {
+    if (log != ".." && log != ".") {
+      CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+    }
+  }
+
+  // recover the DB
+  Close();
+
+  // 1. check consistency
+  // 2. copy the logs from backup back to WAL dir. if the recovery happens
+  // again on the same log files, this should lead to incorrect results
+  // due to applying merge operator twice
+  // 3. check consistency
+  for (int iter = 0; iter < 2; ++iter) {
+    // assert consistency
+    Open({"default", "cf1", "cf2"});
+    ASSERT_EQ(two, Get(0, "foo"));
+    ASSERT_EQ(one, Get(0, "bar"));
+    ASSERT_EQ(three, Get(1, "mirko"));
+    ASSERT_EQ(one, Get(1, "franjo"));
+    ASSERT_EQ(one, Get(2, "fodor"));
+    ASSERT_EQ(two, Get(2, "bla"));
+    Close();
+
+    if (iter == 0) {
+      // copy the logs from backup back to wal dir
+      for (auto& log : logs) {
+        if (log != ".." && log != ".") {
+          CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+        }
+      }
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE  // TEST functions used are not supported
+TEST_P(ColumnFamilyTest, FlushTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int j = 0; j < 2; j++) {
+    ReadOptions ro;
+    std::vector<Iterator*> iterators;
+    // Hold super version.
+    if (j == 0) {
+      ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+    }
+
+    for (int i = 0; i < 3; ++i) {
+      uint64_t max_total_in_memory_state = MaxTotalInMemoryState();
+      Flush(i);
+      AssertMaxTotalInMemoryState(max_total_in_memory_state);
+    }
+    ASSERT_OK(Put(1, "foofoo", "bar"));
+    ASSERT_OK(Put(0, "foofoo", "bar"));
+
+    for (auto* it : iterators) {
+      delete it;
+    }
+  }
+  Reopen();
+
+  for (int iter = 0; iter <= 2; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, LogDeletionTest) {
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  column_family_options_.arena_block_size = 4 * 1024;
+  column_family_options_.write_buffer_size = 128000;  // 128KB
+  Open();
+  CreateColumnFamilies({"one", "two", "three", "four"});
+  // Each bracket is one log file. If a number is in (), it means
+  // we don't need it anymore (it's been flushed)
+  // []
+  AssertCountLiveLogFiles(0);
+  PutRandomData(0, 1, 128);
+  // [0]
+  PutRandomData(1, 1, 128);
+  // [0, 1]
+  PutRandomData(1, 1000, 128);
+  WaitForFlush(1);
+  // [0, (1)] [1]
+  AssertCountLiveLogFiles(2);
+  PutRandomData(0, 1, 128);
+  // [0, (1)] [0, 1]
+  AssertCountLiveLogFiles(2);
+  PutRandomData(2, 1, 128);
+  // [0, (1)] [0, 1, 2]
+  PutRandomData(2, 1000, 128);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [2]
+  AssertCountLiveLogFiles(3);
+  PutRandomData(2, 1000, 128);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [(2)] [2]
+  AssertCountLiveLogFiles(4);
+  PutRandomData(3, 1, 128);
+  // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+  PutRandomData(1, 1, 128);
+  // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+  AssertCountLiveLogFiles(4);
+  PutRandomData(1, 1000, 128);
+  WaitForFlush(1);
+  // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+  AssertCountLiveLogFiles(5);
+  PutRandomData(0, 1000, 128);
+  WaitForFlush(0);
+  // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+  // delete obsolete logs -->
+  // [(1), 2, 3] [1, (0)] [0]
+  AssertCountLiveLogFiles(3);
+  PutRandomData(0, 1000, 128);
+  WaitForFlush(0);
+  // [(1), 2, 3] [1, (0)], [(0)] [0]
+  AssertCountLiveLogFiles(4);
+  PutRandomData(1, 1000, 128);
+  WaitForFlush(1);
+  // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+  AssertCountLiveLogFiles(5);
+  PutRandomData(2, 1000, 128);
+  WaitForFlush(2);
+  // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+  AssertCountLiveLogFiles(6);
+  PutRandomData(3, 1000, 128);
+  WaitForFlush(3);
+  // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+  // delete obsolete logs -->
+  // [0, (1)] [1, (2)], [2, (3)] [3]
+  AssertCountLiveLogFiles(4);
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CrashAfterFlush) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  db_options_.env = fault_env.get();
+  Open();
+  CreateColumnFamilies({"one"});
+
+  WriteBatch batch;
+  batch.Put(handles_[0], Slice("foo"), Slice("bar"));
+  batch.Put(handles_[1], Slice("foo"), Slice("bar"));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  Flush(0);
+  fault_env->SetFilesystemActive(false);
+
+  std::vector<std::string> names;
+  for (auto name : names_) {
+    if (name != "") {
+      names.push_back(name);
+    }
+  }
+  Close();
+  fault_env->DropUnsyncedFileData();
+  fault_env->ResetState();
+  Open(names, {});
+
+  // Write batch should be atomic.
+  ASSERT_EQ(Get(0, "foo"), Get(1, "foo"));
+
+  Close();
+  db_options_.env = env_;
+}
+
+TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
+  ASSERT_OK(TryOpen({"default"}));
+  Close();
+  ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
+  // disable flushing stale column families
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ColumnFamilyOptions default_cf, one, two, three;
+  // Set up options. All column families have max_write_buffer_number set
+  // to 10.
+  // "default" -> 100KB memtable, start flushing immediately
+  // "one" -> 200KB memtable, start flushing with two immutable memtables
+  // "two" -> 1MB memtable, start flushing with three immutable memtables
+  // "three" -> 90KB memtable, start flushing with four immutable memtables
+  default_cf.write_buffer_size = 100000;
+  default_cf.arena_block_size = 4 * 4096;
+  default_cf.max_write_buffer_number = 10;
+  default_cf.min_write_buffer_number_to_merge = 1;
+  default_cf.max_write_buffer_size_to_maintain = 0;
+  one.write_buffer_size = 200000;
+  one.arena_block_size = 4 * 4096;
+  one.max_write_buffer_number = 10;
+  one.min_write_buffer_number_to_merge = 2;
+  one.max_write_buffer_size_to_maintain =
+      static_cast<int64_t>(one.write_buffer_size);
+  two.write_buffer_size = 1000000;
+  two.arena_block_size = 4 * 4096;
+  two.max_write_buffer_number = 10;
+  two.min_write_buffer_number_to_merge = 3;
+  two.max_write_buffer_size_to_maintain =
+      static_cast<int64_t>(two.write_buffer_size);
+  three.write_buffer_size = 4096 * 22;
+  three.arena_block_size = 4096;
+  three.max_write_buffer_number = 10;
+  three.min_write_buffer_number_to_merge = 4;
+  three.max_write_buffer_size_to_maintain =
+      static_cast<int64_t>(three.write_buffer_size);
+
+  Reopen({default_cf, one, two, three});
+
+  int micros_wait_for_flush = 10000;
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(1);
+  PutRandomData(1, 200, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  AssertCountLiveLogFiles(2);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+  AssertCountLiveLogFiles(3);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+  AssertCountLiveLogFiles(4);
+  PutRandomData(3, 93, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+  AssertCountLiveLogFiles(5);
+  PutRandomData(3, 88, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+  AssertCountLiveLogFiles(6);
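+  // (Illustration, assuming the WAL bookkeeping exercised here: every
+  // memtable switch rolls the WAL, and an old log file stays "live" until
+  // every CF with unflushed data in it has flushed, which is why the
+  // live-log count keeps growing while "one", "two" and "three" accumulate
+  // immutable memtables below their min_write_buffer_number_to_merge.)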
+  PutRandomData(3, 88, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  AssertCountLiveLogFiles(7);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  AssertCountLiveLogFiles(8);
+  PutRandomData(2, 100, 10000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+  AssertCountLiveLogFiles(9);
+  PutRandomData(3, 88, 990);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  AssertCountLiveLogFiles(10);
+  PutRandomData(3, 88, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+  AssertCountLiveLogFiles(11);
+  PutRandomData(1, 200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+  AssertCountLiveLogFiles(5);
+  PutRandomData(3, 88 * 3, 990);
+  WaitForFlush(3);
+  PutRandomData(3, 88 * 4, 990);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(12);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(12);
+  PutRandomData(2, 3 * 1000, 1000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(12);
+  PutRandomData(1, 2 * 200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(7);
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+// The test is commented out because we want to verify that a snapshot is
+// not created for memtables that do not support it, but there isn't a
+// memtable that doesn't support snapshots right now. If we have one later,
+// we can re-enable the test.
+//
+// #ifndef ROCKSDB_LITE  // Cuckoo is not supported in lite
+// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+//   db_options_.allow_concurrent_memtable_write = false;
+//   Open();
+//   auto* s1 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s1 != nullptr);
+//   dbfull()->ReleaseSnapshot(s1);
+
+//   // Add a column family that doesn't support snapshot
+//   ColumnFamilyOptions first;
+//   first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+//   CreateColumnFamilies({"first"}, {first});
+//   auto* s2 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s2 == nullptr);
+
+//   // Add a column family that supports snapshot. Snapshot stays not
+//   // supported.
+//   ColumnFamilyOptions second;
+//   CreateColumnFamilies({"second"}, {second});
+//   auto* s3 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s3 == nullptr);
+//   Close();
+// }
+// #endif  // !ROCKSDB_LITE
+
+class TestComparator : public Comparator {
+  int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+              const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+    return 0;
+  }
+  const char* Name() const override { return "Test"; }
+  void FindShortestSeparator(
+      std::string* /*start*/,
+      const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+static TestComparator third_comparator;
+static TestComparator fourth_comparator;
+
+// Test that we can retrieve the comparator from a created CF
+TEST_P(ColumnFamilyTest, GetComparator) {
+  Open();
+  // Add a column family with no comparator specified
+  CreateColumnFamilies({"first"});
+  const Comparator* comp = handles_[0]->GetComparator();
+  ASSERT_EQ(comp, BytewiseComparator());
+
+  // Add three column families - one with no comparator and two
+  // with comparators specified
+  ColumnFamilyOptions second, third, fourth;
+  second.comparator = &third_comparator;
+  third.comparator = &fourth_comparator;
+  CreateColumnFamilies({"second", "third", "fourth"}, {second, third, fourth});
+  ASSERT_EQ(handles_[1]->GetComparator(), BytewiseComparator());
+  ASSERT_EQ(handles_[2]->GetComparator(), &third_comparator);
+  ASSERT_EQ(handles_[3]->GetComparator(), &fourth_comparator);
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
+  Open();
+  CreateColumnFamilies({"first", "second"});
+  ColumnFamilyOptions default_cf, first, second;
+  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  second.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen({default_cf, first, second});
+
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  ASSERT_OK(Put(0, "foo", two));
+  ASSERT_OK(Put(0, "foo", one));
+  ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+  ASSERT_EQ(Get(0, "foo"), one);
+
+  ASSERT_OK(Put(1, "foo", two));
+  ASSERT_OK(Put(1, "foo", one));
+  ASSERT_OK(Merge(1, "foo", two));
+  ASSERT_EQ(Get(1, "foo"), three);
+
+  ASSERT_OK(Put(2, "foo", two));
+  ASSERT_OK(Put(2, "foo", one));
+  ASSERT_OK(Merge(2, "foo", two));
+  ASSERT_EQ(Get(2, "foo"), one + "," + two);
+  Close();
+}
+
+#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = static_cast<uint64_t>(1) << 60;
+
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(1, 10, 12000);
+    PutRandomData(1, 1, 10);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(ToString(i + 1), 2);
+  }
+
+  // TRIGGER compaction "one"
+  PutRandomData(1, 10, 12000);
+  PutRandomData(1, 1, 10);
+
+  // TRIGGER compaction "two"
+  PutRandomData(2, 10, 12000);
+  PutRandomData(2, 1, 10);
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+// Sync points are not supported in RocksDB Lite
+
+TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+  bool cf_1_1 = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"},
+       {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"},
+       {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1) {
+          TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4");
+          cf_1_1 = false;
+          TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<ROCKSDB_NAMESPACE::port::Thread> threads;
+  threads.emplace_back([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(ToString(i + 1), 2);
+  }
+  threads.emplace_back([&] {
+    TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1");
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+    TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2");
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5");
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  bool cf_1_1 = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"},
+       {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"},
+       {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1) {
+          cf_1_1 = false;
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(ToString(i + 1), 2);
+  }
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+    TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+  threads.join();
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+  bool cf_1_1 = true;
+  bool cf_1_2 = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"},
+       {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+          cf_1_1 = false;
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+        } else if (cf_1_2) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+          cf_1_2 = false;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(ToString(i + 1), 2);
+  }
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+  threads.join();
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+  bool cf_1_1 = true;
+  bool cf_1_2 = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"},
+       {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"},
+       {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"},
+       {"ColumnFamilyTest::ManualManual:1",
+        "ColumnFamilyTest::ManualManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4");
+          cf_1_1 = false;
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3");
+        } else if (cf_1_2) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2");
+          cf_1_2 = false;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = true;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5");
+
+  WaitForFlush(1);
+
+  // Add more L0 files and force another manual compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+                        1);
+  }
+
+  ROCKSDB_NAMESPACE::port::Thread threads1([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1");
+
+  threads.join();
+  threads1.join();
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+  bool cf_1_1 = true;
+  bool cf_1_2 = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+       {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+          cf_1_1 = false;
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+        } else if (cf_1_2) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+          cf_1_2 = false;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+  WaitForFlush(1);
+
+  // Add more L0 files and force automatic compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+                        1);
+  }
+
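+  // (Illustration, assuming the sync-point schedule above: the first manual
+  // compaction is parked inside NonTrivial:AfterRun, so the L0 files written
+  // by this loop stack up behind it; releasing ManualAuto:1 below lets the
+  // blocked job finish and the pending automatic compaction drain the level.)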
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+  threads.join();
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleLevel;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 3 files
+  one.level0_file_num_compaction_trigger = 3;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- level style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+  bool cf_1_1 = true;
+  bool cf_1_2 = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+       {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "ColumnFamilyTest::ManualAuto:3"},
+       {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+          cf_1_1 = false;
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+        } else if (cf_1_2) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+          cf_1_2 = false;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+  // Add more L0 files and force automatic compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i),
+                        1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+  threads.join();
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("0,1", 1);
+
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In this test, we generate enough files to trigger automatic compactions.
+// The automatic compaction waits in NonTrivial:AfterRun.
+// We generate more files and then trigger a manual compaction.
+// This will wait because the automatic compaction has files it needs.
+// Once the conflict is hit, the automatic compaction starts and ends.
+// Then the manual compaction will run and end.
+TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  bool cf_1_1 = true;
+  bool cf_1_2 = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"},
+       {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"},
+       {"CompactionPicker::CompactRange:Conflict",
+        "ColumnFamilyTest::AutoManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1) {
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+          cf_1_1 = false;
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+        } else if (cf_1_2) {
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+          cf_1_2 = false;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(ToString(i + 1), 1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+
+  // Add more L0 files, then force a manual compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.exclusive_manual_compaction = false;
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // Tailing iterator not supported
+namespace {
+std::string IterStatus(Iterator* iter) {
+  std::string result;
+  if (iter->Valid()) {
+    result = iter->key().ToString() + "->" + iter->value().ToString();
+  } else {
+    result = "(invalid)";
+  }
+  return result;
+}
+}  // anonymous namespace
+
+TEST_P(ColumnFamilyTest, NewIteratorsTest) {
+  // iter == 0 -- no tailing
+  // iter == 1 -- tailing
+  for (int iter = 0; iter < 2; ++iter) {
+    Open();
+    CreateColumnFamiliesAndReopen({"one", "two"});
+    ASSERT_OK(Put(0, "a", "b"));
+    ASSERT_OK(Put(1, "b", "a"));
+    ASSERT_OK(Put(2, "c", "m"));
+    ASSERT_OK(Put(2, "v", "t"));
+    std::vector<Iterator*> iterators;
+    ReadOptions options;
+    options.tailing = (iter == 1);
+    ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+    for (auto it : iterators) {
+      it->SeekToFirst();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+    ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+    ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
+    ASSERT_OK(Put(1, "x", "x"));
+
+    for (auto it : iterators) {
+      it->Next();
+    }
+
+    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+    if (iter == 0) {
+      // no tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+    } else {
+      // tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+    }
+    ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+    for (auto it : iterators) {
+      delete it;
+    }
+    Destroy();
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // ReadOnlyDB is not supported
+TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+  ASSERT_OK(Put(0, "a", "b"));
+  ASSERT_OK(Put(1, "foo", "bla"));
+  ASSERT_OK(Put(2, "foo", "blabla"));
+  ASSERT_OK(Put(3, "foo", "blablabla"));
+  ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+  DropColumnFamilies({2});
+  Close();
+  // open only a subset of column families
+  AssertOpenReadOnly({"default", "one", "four"});
+  ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  ASSERT_EQ("bla", Get(1, "foo"));
+  ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+  // test NewIterators
+  {
+    std::vector<Iterator*> iterators;
+    ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators));
+    for (auto it : iterators) {
+      it->SeekToFirst();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+    ASSERT_EQ(IterStatus(iterators[1]), "foo->bla");
+    ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla");
+    for (auto it : iterators) {
+      it->Next();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+    ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+    ASSERT_EQ(IterStatus(iterators[2]), "(invalid)");
+
+    for (auto it : iterators) {
+      delete it;
+    }
+  }
+
+  Close();
+  // can't open dropped column family
+  Status s = OpenReadOnly({"default", "one", "two"});
+  ASSERT_TRUE(!s.ok());
+
+  // Can't open without specifying default column family
+  s = OpenReadOnly({"one", "four"});
+  ASSERT_TRUE(!s.ok());
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported in lite
+TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+  for (size_t i = 0; i < handles_.size(); ++i) {
+    PutRandomData(static_cast<int>(i), 10, 100);
+  }
+  int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+  // this will trigger the flushes
+  for (int i = 0; i <= 4; ++i) {
+    ASSERT_OK(Flush(i));
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    WaitForFlush(i);
+  }
+  int total_new_writable_files =
+      env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+  ASSERT_EQ(static_cast<size_t>(total_new_writable_files),
+            handles_.size() + 1);
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // WaitForCompaction() is not supported in lite
+TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  default_cf.write_buffer_size = 100000;  // small write buffer size
+  default_cf.arena_block_size = 4096;
+  default_cf.disable_auto_compactions = true;
+  one.disable_auto_compactions = true;
+  two.disable_auto_compactions = true;
+  db_options_.max_total_wal_size = 210000;
+
+  Reopen({default_cf, one, two});
+
+  PutRandomData(2, 1, 10);  // 10 bytes
+  for (int i = 0; i < 2; ++i) {
+    PutRandomData(0, 100, 1000);  // flush
+    WaitForFlush(0);
+
+    AssertCountLiveFiles(i + 1);
+  }
+  // third flush. now, CF [two] should be detected as stale and flushed
+  // column family 1 should not be flushed since it's empty
+  PutRandomData(0, 100, 1000);  // flush
+  WaitForFlush(0);
+  WaitForFlush(2);
+  // 3 files for the default column family, 1 file for column family [two],
+  // zero files for column family [one], because it's empty
+  AssertCountLiveFiles(4);
+
+  Flush(0);
+  ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
+  Status s = TryOpen({"one", "two"});
+  ASSERT_TRUE(!s.ok());
+  db_options_.create_missing_column_families = true;
+  s = TryOpen({"default", "one", "two"});
+  ASSERT_TRUE(s.ok());
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, SanitizeOptions) {
+  DBOptions db_options;
+  for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
+    for (int l = 0; l <= 2; l++) {
+      for (int i = 1; i <= 3; i++) {
+        for (int j = 1; j <= 3; j++) {
+          for (int k = 1; k <= 3; k++) {
+            ColumnFamilyOptions original;
+            original.compaction_style = static_cast<CompactionStyle>(s);
+            original.num_levels = l;
+            original.level0_stop_writes_trigger = i;
+            original.level0_slowdown_writes_trigger = j;
+            original.level0_file_num_compaction_trigger = k;
+            original.write_buffer_size =
+                l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
+
+            ColumnFamilyOptions result =
+                SanitizeOptions(ImmutableDBOptions(db_options), original);
+            ASSERT_TRUE(result.level0_stop_writes_trigger >=
+                        result.level0_slowdown_writes_trigger);
+            ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+                        result.level0_file_num_compaction_trigger);
+            ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+                        original.level0_file_num_compaction_trigger);
+            if (s == kCompactionStyleLevel) {
+              ASSERT_GE(result.num_levels, 2);
+            } else {
+              ASSERT_GE(result.num_levels, 1);
+              if (original.num_levels >= 1) {
+                ASSERT_EQ(result.num_levels, original.num_levels);
+              }
+            }
+
+            // Make sure SanitizeOptions sets arena_block_size to 1/8 of
+            // the write_buffer_size, rounded up to a multiple of 4k.
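+            // (Worked example, assuming the formula checked below: with
+            // l = 1, i = 1, j = 1, k = 1 the write buffer is
+            // 4 MB + 1 MB + 1025 bytes, so 1/8 of the MB part is 640 KB and
+            // the 4 KB round-up for the non-multiple remainder brings the
+            // expected arena_block_size to 640 KB + 4 KB.)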
+            size_t expected_arena_block_size =
+                l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8;
+            if (j + k != 0) {
+              // not a multiple of 4k, round up 4k
+              expected_arena_block_size += 4 * 1024;
+            }
+            ASSERT_EQ(expected_arena_block_size, result.arena_block_size);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) {
+  // iter 0 -- drop CF, don't reopen
+  // iter 1 -- delete CF, reopen
+  for (int iter = 0; iter < 2; ++iter) {
+    db_options_.create_missing_column_families = true;
+    db_options_.max_open_files = 20;
+    // delete obsolete files always
+    db_options_.delete_obsolete_files_period_micros = 0;
+    Open({"default", "one", "two"});
+    ColumnFamilyOptions options;
+    options.level0_file_num_compaction_trigger = 100;
+    options.level0_slowdown_writes_trigger = 200;
+    options.level0_stop_writes_trigger = 200;
+    options.write_buffer_size = 100000;  // small write buffer size
+    Reopen({options, options, options});
+
+    // 1MB should create ~10 files for each CF
+    int kKeysNum = 10000;
+    PutRandomData(0, kKeysNum, 100);
+    PutRandomData(1, kKeysNum, 100);
+    PutRandomData(2, kKeysNum, 100);
+
+    {
+      std::unique_ptr<Iterator> iterator(
+          db_->NewIterator(ReadOptions(), handles_[2]));
+      iterator->SeekToFirst();
+
+      if (iter == 0) {
+        // Drop CF two
+        ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+      } else {
+        // delete CF two
+        db_->DestroyColumnFamilyHandle(handles_[2]);
+        handles_[2] = nullptr;
+      }
+      // Make sure iterator created can still be used.
+      int count = 0;
+      for (; iterator->Valid(); iterator->Next()) {
+        ASSERT_OK(iterator->status());
+        ++count;
+      }
+      ASSERT_OK(iterator->status());
+      ASSERT_EQ(count, kKeysNum);
+    }
+
+    // Add a bunch more data to other CFs
+    PutRandomData(0, kKeysNum, 100);
+    PutRandomData(1, kKeysNum, 100);
+
+    if (iter == 1) {
+      Reopen();
+    }
+
+    // Since we didn't delete the CF handle, RocksDB's contract guarantees
+    // that we're still able to read the dropped CF
+    for (int i = 0; i < 3; ++i) {
+      std::unique_ptr<Iterator> iterator(
+          db_->NewIterator(ReadOptions(), handles_[i]));
+      int count = 0;
+      for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+        ASSERT_OK(iterator->status());
+        ++count;
+      }
+      ASSERT_OK(iterator->status());
+      ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
+    }
+
+    Close();
+    Destroy();
+  }
+}
+
+TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
+  db_options_.create_missing_column_families = true;
+  db_options_.max_open_files = 20;
+  // delete obsolete files always
+  db_options_.delete_obsolete_files_period_micros = 0;
+  Open({"default", "one", "two"});
+  ColumnFamilyOptions options;
+  options.level0_file_num_compaction_trigger = 100;
+  options.level0_slowdown_writes_trigger = 200;
+  options.level0_stop_writes_trigger = 200;
+  options.write_buffer_size = 100000;  // small write buffer size
+  Reopen({options, options, options});
+
+  // 1MB should create ~10 files for each CF
+  int kKeysNum = 10000;
+  PutRandomData(1, kKeysNum, 100);
+
+  {
+    std::unique_ptr<Iterator> iterator(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iterator->SeekToFirst();
+
+    DropColumnFamilies({1});
+
+    // Make sure iterator created can still be used.
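+    // (Note, assuming RocksDB's drop semantics: the open iterator pins a
+    // SuperVersion, so the dropped CF's memtables and SST files stay
+    // readable through it until the iterator is destroyed, even though the
+    // CF no longer accepts writes.)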
+    int count = 0;
+    for (; iterator->Valid(); iterator->Next()) {
+      ASSERT_OK(iterator->status());
+      ++count;
+    }
+    ASSERT_OK(iterator->status());
+    ASSERT_EQ(count, kKeysNum);
+  }
+
+  Reopen();
+  Close();
+  Destroy();
+}
+
+TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
+  db_options_.create_missing_column_families = true;
+  Open({"default", "one"});
+  ColumnFamilyOptions options;
+  options.level0_file_num_compaction_trigger = 100;
+  options.level0_slowdown_writes_trigger = 200;
+  options.level0_stop_writes_trigger = 200;
+  options.max_write_buffer_number = 20;
+  options.write_buffer_size = 100000;  // small write buffer size
+  Reopen({options, options});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"VersionSet::LogAndApply::ColumnFamilyDrop:0",
+        "FlushJob::WriteLevel0Table"},
+       {"VersionSet::LogAndApply::ColumnFamilyDrop:1",
+        "FlushJob::InstallResults"},
+       {"FlushJob::InstallResults",
+        "VersionSet::LogAndApply::ColumnFamilyDrop:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  test::SleepingBackgroundTask sleeping_task;
+
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+
+  // 1MB should create ~10 files for each CF
+  int kKeysNum = 10000;
+  PutRandomData(1, kKeysNum, 100);
+
+  std::vector<ROCKSDB_NAMESPACE::port::Thread> threads;
+  threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); });
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+  sleeping_task.Reset();
+  // now we sleep again. this is just so we're certain that the flush job
+  // finished
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+
+  {
+    // Since we didn't delete the CF handle, RocksDB's contract guarantees
+    // that we're still able to read the dropped CF
+    std::unique_ptr<Iterator> iterator(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    int count = 0;
+    for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      ASSERT_OK(iterator->status());
+      ++count;
+    }
+    ASSERT_OK(iterator->status());
+    ASSERT_EQ(count, kKeysNum);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  Close();
+  Destroy();
+}
+
+#ifndef ROCKSDB_LITE
+// skipped as persisting options is not supported in ROCKSDB_LITE
+namespace {
+std::atomic<int> test_stage(0);
+std::atomic<bool> ordered_by_writethread(false);
+const int kMainThreadStartPersistingOptionsFile = 1;
+const int kChildThreadFinishDroppingColumnFamily = 2;
+void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id,
+                            std::vector<Comparator*>* comparators) {
+  while (test_stage < kMainThreadStartPersistingOptionsFile &&
+         !ordered_by_writethread) {
+    Env::Default()->SleepForMicroseconds(100);
+  }
+  cf_test->DropColumnFamilies({cf_id});
+  if ((*comparators)[cf_id]) {
+    delete (*comparators)[cf_id];
+    (*comparators)[cf_id] = nullptr;
+  }
+  test_stage = kChildThreadFinishDroppingColumnFamily;
+}
+}  // namespace
+
+TEST_P(ColumnFamilyTest, CreateAndDropRace) {
+  const int kCfCount = 5;
+  std::vector<ColumnFamilyOptions> cf_opts;
+  std::vector<Comparator*> comparators;
+  for (int i = 0; i < kCfCount; ++i) {
+    cf_opts.emplace_back();
+    comparators.push_back(new test::SimpleSuffixReverseComparator());
+    cf_opts.back().comparator = comparators.back();
+  }
+  db_options_.create_if_missing = true;
+  db_options_.create_missing_column_families = true;
+
+  auto main_thread_id = std::this_thread::get_id();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PersistRocksDBOptions:start", [&](void* /*arg*/) {
+        auto current_thread_id = std::this_thread::get_id();
+        // If it's the main thread hitting this sync-point, then it
+        // will be blocked until some other thread updates the test_stage.
+        if (main_thread_id == current_thread_id) {
+          test_stage = kMainThreadStartPersistingOptionsFile;
+          while (test_stage < kChildThreadFinishDroppingColumnFamily &&
+                 !ordered_by_writethread) {
+            Env::Default()->SleepForMicroseconds(100);
+          }
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) {
+        // This means a thread doing DropColumnFamily() is waiting for
+        // another thread to finish persisting options.
+        // In such a case, we update the test_stage to unblock the main
+        // thread.
+        ordered_by_writethread = true;
+      });
+
+  // Create a database with four column families
+  Open({"default", "one", "two", "three"},
+       {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start a thread that will drop the first column family
+  // and its comparator
+  ROCKSDB_NAMESPACE::port::Thread drop_cf_thread(DropSingleColumnFamily, this,
+                                                 1, &comparators);
+
+  DropColumnFamilies({2});
+
+  drop_cf_thread.join();
+  Close();
+  Destroy();
+  for (auto* comparator : comparators) {
+    if (comparator) {
+      delete comparator;
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
+  const uint64_t kBaseRate = 800000u;
+  db_options_.delayed_write_rate = kBaseRate;
+  db_options_.max_background_compactions = 6;
+
+  Open({"default"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+
+  mutable_cf_options.level0_slowdown_writes_trigger = 20;
+  mutable_cf_options.level0_stop_writes_trigger = 10000;
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+  mutable_cf_options.disable_auto_compactions = false;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(400);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(450);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(205);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(202);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(198);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(399);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(599);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(2001);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(3001);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(390);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(100);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->set_l0_delay_trigger_count(100);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(101);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(0);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(101);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(200);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(0);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(0);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  mutable_cf_options.disable_auto_compactions = true;
+  dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->set_l0_delay_trigger_count(50);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(0, GetDbDelayedWriteRate());
+  ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+  vstorage->set_l0_delay_trigger_count(60);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(0, GetDbDelayedWriteRate());
+  ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+  mutable_cf_options.disable_auto_compactions = false;
+  vstorage->set_l0_delay_trigger_count(70);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(71);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(501);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) {
+  db_options_.max_background_compactions = 6;
+  Open({"default"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+
+  // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+  mutable_cf_options.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options.level0_slowdown_writes_trigger = 36;
+  mutable_cf_options.level0_stop_writes_trigger = 50;
+  // Speedup threshold = 200 / 4 = 50
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(50);
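+  // Note: 50 is exactly the speedup threshold computed above
+  // (soft_pending_compaction_bytes_limit / 4 = 200 / 4), so the
+  // recalculation below is expected to raise the allowed background
+  // compactions from 1 to the configured maximum of 6.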
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(45);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(7);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(9);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(6);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6
+  mutable_cf_options.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options.level0_slowdown_writes_trigger = 16;
+  mutable_cf_options.level0_stop_writes_trigger = 30;
+
+  vstorage->set_l0_delay_trigger_count(5);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(7);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(3);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) {
+  const uint64_t kBaseRate = 810000u;
+  db_options_.delayed_write_rate = kBaseRate;
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  ColumnFamilyData* cfd1 =
+      static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+  mutable_cf_options.level0_slowdown_writes_trigger = 20;
+  mutable_cf_options.level0_stop_writes_trigger = 10000;
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+  MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+  mutable_cf_options1.soft_pending_compaction_bytes_limit = 500;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(201);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(70);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
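+  // Note (summarizing what the assertions in these write-stall tests
+  // encode): the write controller is shared by all column families, so the
+  // CF in the worst state dictates the DB-wide delayed write rate, and each
+  // further deterioration divides the rate by another factor of 1.25, i.e.
+  // roughly delayed_rate(n) = delayed_write_rate / 1.25^n.
+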
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(800);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(700);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) {
+  db_options_.max_background_compactions = 6;
+  column_family_options_.soft_pending_compaction_bytes_limit = 200;
+  column_family_options_.hard_pending_compaction_bytes_limit = 2000;
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  ColumnFamilyData* cfd1 =
+      static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+  // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+  mutable_cf_options.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options.level0_slowdown_writes_trigger = 36;
+  mutable_cf_options.level0_stop_writes_trigger = 30;
+  // Speedup threshold = 200 / 4 = 50
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+  MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+  mutable_cf_options1.level0_slowdown_writes_trigger = 16;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(60);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(30);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(70);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(20);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(3);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(9);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->set_l0_delay_trigger_count(2);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(0);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, CreateAndDestoryOptions) {
+  std::unique_ptr<ColumnFamilyOptions> cfo(new ColumnFamilyOptions());
+  ColumnFamilyHandle* cfh;
+  Open();
+  ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh));
+  cfo.reset();
+  ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+  ASSERT_OK(db_->DropColumnFamily(cfh));
+  ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
+  ColumnFamilyHandle* cfh;
+  Open();
+  ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+  ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+  ASSERT_OK(db_->DropColumnFamily(cfh));
+  ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
+  ColumnFamilyHandle* cfh;
+  Open();
+  ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+  ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+  ASSERT_OK(db_->DisableFileDeletions());
+  ASSERT_OK(db_->DropColumnFamily(cfh));
+  ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
+  SpecialEnv env(Env::Default());
+  db_options_.env = &env;
+  db_options_.max_background_flushes = 1;
+  column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+  Open();
+  CreateColumnFamilies({"one"});
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(0, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BGWorkFlush:done", "FlushCloseWALFiles:0"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Block flush jobs from running
+  test::SleepingBackgroundTask sleeping_task;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+
+  WriteOptions wo;
+  wo.sync = true;
+  ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+  TEST_SYNC_POINT("FlushCloseWALFiles:0");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+  Reopen();
+  ASSERT_EQ("mirko", Get(0, "fodor"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  db_options_.env = env_;
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
+  SpecialEnv env(Env::Default());
+  db_options_.env = &env;
+  db_options_.max_background_flushes = 1;
+  column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2));
+  Open();
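+  // The scenario below relies on the iterator pinning a SuperVersion:
+  // while flushes are blocked, the iterator holds the last reference to
+  // memtables that still point at an old WAL, so destroying the iterator
+  // is what finally lets the obsolete WAL file be closed.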
CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + // Create an iterator holding the current super version. + Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]); + // A flush will make `it` hold the last reference of its super version. + Flush(1); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + // Flush jobs will close previous WAL files after finishing. By + // block flush jobs from running, we trigger a condition where + // the iterator destructor should close the WAL files. + test::SleepingBackgroundTask sleeping_task; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::HIGH); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + delete it; + ASSERT_EQ(1, env.num_open_wal_file_.load()); + + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + WaitForFlush(1); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} + +TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) { + SpecialEnv env(Env::Default()); + // Allow both of flush and purge job to schedule. + env.SetBackgroundThreads(2, Env::HIGH); + db_options_.env = &env; + db_options_.max_background_flushes = 1; + column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + Open(); + CreateColumnFamilies({"one"}); + ASSERT_OK(Put(1, "fodor", "mirko")); + // Create an iterator holding the current super version. + ReadOptions ro; + ro.background_purge_on_iterator_cleanup = true; + Iterator* it = db_->NewIterator(ro, handles_[1]); + // A flush will make `it` hold the last reference of its super version. + Flush(1); + + ASSERT_OK(Put(1, "fodor", "mirko")); + ASSERT_OK(Put(0, "fodor", "mirko")); + ASSERT_OK(Put(1, "fodor", "mirko")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"ColumnFamilyTest::IteratorCloseWALFile2:0", + "DBImpl::BGWorkPurge:start"}, + {"ColumnFamilyTest::IteratorCloseWALFile2:2", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = true; + ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko")); + + ASSERT_EQ(2, env.num_open_wal_file_.load()); + // Deleting the iterator will clear its super version, triggering + // closing all files + delete it; + ASSERT_EQ(2, env.num_open_wal_file_.load()); + + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0"); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1"); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2"); + WaitForFlush(1); + ASSERT_EQ(1, env.num_open_wal_file_.load()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + Reopen(); + ASSERT_EQ("mirko", Get(0, "fodor")); + ASSERT_EQ("mirko", Get(1, "fodor")); + db_options_.env = env_; + Close(); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // TEST functions are not supported in lite +TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { + SpecialEnv env(Env::Default()); + // Allow both of flush and purge job to schedule. 
+  env.SetBackgroundThreads(2, Env::HIGH);
+  db_options_.env = &env;
+  db_options_.max_background_flushes = 1;
+  column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(3));
+  column_family_options_.level0_file_num_compaction_trigger = 2;
+  Open();
+  CreateColumnFamilies({"one"});
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodar2", "mirko"));
+  Flush(1);
+
+  // Create an iterator holding the current super version, as well as
+  // the SST file just flushed.
+  ReadOptions ro;
+  ro.tailing = true;
+  ro.background_purge_on_iterator_cleanup = true;
+  Iterator* it = db_->NewIterator(ro, handles_[1]);
+  // A flush will make `it` hold the last reference of its super version.
+
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodar2", "mirko"));
+  Flush(1);
+
+  WaitForCompaction();
+
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(0, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+       "DBImpl::BGWorkPurge:start"},
+      {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+       "DBImpl::BackgroundCallFlush:start"},
+      {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = true;
+  ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+  env.delete_count_.store(0);
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+  // Seeking the iterator will release its super version, triggering
+  // closing all files
+  it->Seek("");
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+  ASSERT_EQ(0, env.delete_count_.load());
+
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+  ASSERT_EQ(1, env.delete_count_.load());
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+  WaitForFlush(1);
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+  ASSERT_EQ(1, env.delete_count_.load());
+
+  delete it;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  Reopen();
+  ASSERT_EQ("mirko", Get(0, "fodor"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  db_options_.env = env_;
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+// Disabled on Windows because SyncWAL requires env->IsSyncThreadSafe()
+// to return true, which is not so in unbuffered mode.
+#ifndef OS_WIN
+TEST_P(ColumnFamilyTest, LogSyncConflictFlush) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+
+  Put(0, "", "");
+  Put(1, "foo", "bar");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+        "ColumnFamilyTest::LogSyncConflictFlush:1"},
+       {"ColumnFamilyTest::LogSyncConflictFlush:2",
+        "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread([&] { db_->SyncWAL(); });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1");
+  Flush(1);
+  Put(1, "foo", "bar");
+  Flush(1);
+
+  TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2");
+
+  thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  Close();
+}
+#endif
+
+// This test is placed here because the infrastructure for the column family
+// test is being used to ensure a roll of the WAL files.
+// The basic idea is to test that WAL truncation is being detected and not
+// ignored
+TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+
+  Build(0, 100);
+
+  // Flush the 0th column family to force a roll of the wal log
+  Flush(0);
+
+  // Add some more entries
+  Build(100, 100);
+
+  std::vector<std::string> filenames;
+  ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+
+  // Collect the WAL files
+  std::vector<std::string> logfs;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    uint64_t number;
+    FileType type;
+    if (!(ParseFileName(filenames[i], &number, &type))) continue;
+
+    if (type != kLogFile) continue;
+
+    logfs.push_back(filenames[i]);
+  }
+
+  std::sort(logfs.begin(), logfs.end());
+  ASSERT_GE(logfs.size(), 2);
+
+  // Take the last but one file, and truncate it
+  std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2];
+  std::vector<std::string> names_save = names_;
+
+  uint64_t fsize;
+  ASSERT_OK(env_->GetFileSize(fpath, &fsize));
+  ASSERT_GT(fsize, 0);
+
+  Close();
+
+  std::string backup_logs = dbname_ + "/backup_logs";
+  std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2];
+
+  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+  // Not sure how easy it is to make this data-driven; we would
+  // need to read back the WAL file and truncate its last 10
+  // entries
+  CopyFile(fpath, t_fpath, fsize - 9180);
+
+  ASSERT_OK(env_->DeleteFile(fpath));
+  ASSERT_OK(env_->RenameFile(t_fpath, fpath));
+
+  db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+  OpenReadOnly(names_save);
+
+  CheckMissed();
+
+  Close();
+
+  Open(names_save);
+
+  CheckMissed();
+
+  Close();
+
+  // cleanup
+  env_->DeleteDir(backup_logs);
+}
+
+TEST_P(ColumnFamilyTest, DefaultCfPathsTest) {
+  Open();
+  // Leave cf_paths for one column family empty.
+  // Files should be generated according to db_paths for that
+  // column family.
+  ColumnFamilyOptions cf_opt1, cf_opt2;
+  cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+                                std::numeric_limits<uint64_t>::max());
+  CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+  Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+  // Fill column family 1.
+  PutRandomData(1, 100, 100);
+  Flush(1);
+
+  ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Fill column family 2.
+  PutRandomData(2, 100, 100);
+  Flush(2);
+
+  // The SST from column family 2 should be generated in
+  // db_paths, which is dbname_ in this case.
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+}
+
+TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
+  Open();
+  // Configure column-family-specific paths.
+  ColumnFamilyOptions cf_opt1, cf_opt2;
+  cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+                                std::numeric_limits<uint64_t>::max());
+  cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1",
+                                std::numeric_limits<uint64_t>::max());
+  CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+  Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+  PutRandomData(1, 100, 100, true /* save */);
+  Flush(1);
+
+  // Check that files are generated in the appropriate paths.
+  ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  PutRandomData(2, 100, 100, true /* save */);
+  Flush(2);
+
+  ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Re-open and verify the keys.
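+  // (PutRandomData was called with save == true above, so keys_[cf] holds
+  // the expected key set; the verification below reads each CF both through
+  // an iterator and through point lookups.)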
+  Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  for (int cf = 1; cf != 3; ++cf) {
+    ReadOptions read_options;
+    read_options.readahead_size = 0;
+    auto it = dbi->NewIterator(read_options, handles_[cf]);
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      Slice key(it->key());
+      ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString()));
+    }
+    delete it;
+
+    for (const auto& key : keys_[cf]) {
+      ASSERT_NE("NOT_FOUND", Get(cf, key));
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif  // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 000000000..948ada675
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,421 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactFilesTest : public testing::Test {
+ public:
+  CompactFilesTest() {
+    env_ = Env::Default();
+    db_name_ = test::PerThreadDBPath("compact_files_test");
+  }
+
+  std::string db_name_;
+  Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+  void ClearFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.clear();
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+TEST_F(CompactFilesTest, L0ConflictsFiles) {
+  Options options;
+  // To trigger compaction more easily.
+  const int kWriteBufferSize = 10000;
+  const int kLevel0Trigger = 2;
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleLevel;
+  // Small slowdown and stop trigger for experimental purposes.
+  options.level0_slowdown_writes_trigger = 20;
+  options.level0_stop_writes_trigger = 20;
+  options.write_buffer_size = kWriteBufferSize;
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  options.compression = kNoCompression;
+
+  DB* db = nullptr;
+  DestroyDB(db_name_, options);
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0", "BackgroundCallCompaction:0"},
+      {"BackgroundCallCompaction:1", "CompactFilesImpl:1"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Create a couple of files.
+  // Background compaction starts and waits in BackgroundCallCompaction:0
+  for (int i = 0; i < kLevel0Trigger * 4; ++i) {
+    db->Put(WriteOptions(), ToString(i), "");
+    db->Put(WriteOptions(), ToString(100 - i), "");
+    db->Flush(FlushOptions());
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  std::string file1;
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    if (file1 == "") {
+      file1 = file.db_path + "/" + file.name;
+    } else {
+      std::string file2 = file.db_path + "/" + file.name;
+      // Another thread starts a compact files and creates an L0 compaction.
+      // The background compaction then notices that there is an L0 compaction
+      // already in progress and doesn't do an L0 compaction.
+      // Once the background compaction finishes, the compact files finishes.
+      ASSERT_OK(db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+                                 {file1, file2}, 0));
+      break;
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  delete db;
+}
+
+TEST_F(CompactFilesTest, ObsoleteFiles) {
+  Options options;
+  // To trigger compaction more easily.
+  const int kWriteBufferSize = 65536;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.write_buffer_size = kWriteBufferSize;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+
+  // Add listener.
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  DestroyDB(db_name_, options);
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  // Create a couple of files.
+  for (int i = 1000; i < 2000; ++i) {
+    db->Put(WriteOptions(), ToString(i),
+            std::string(kWriteBufferSize / 10, 'a' + (i % 26)));
+  }
+
+  auto l0_files = collector->GetFlushedFiles();
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
+  reinterpret_cast<DBImpl*>(db)->TEST_WaitForCompact();
+
+  // Verify that all compaction input files are deleted.
+  for (auto fname : l0_files) {
+    ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
+  }
+  delete db;
+}
+
+TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
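+  // (With kCompactionStyleNone the scheduler never starts compactions on
+  // its own, so the two manual CompactFiles() calls below are the only
+  // compactions; the point of this test is that the small
+  // max_compaction_bytes must not cut their L0 output files.)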
+  options.compaction_style = kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = 1000;
+  options.level0_stop_writes_trigger = 1000;
+  options.write_buffer_size = 65536;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+  options.max_compaction_bytes = 5000;
+
+  // Add listener.
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  DestroyDB(db_name_, options);
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  // Create a couple of files.
+  for (int i = 0; i < 500; ++i) {
+    db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+  }
+  reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+  auto l0_files_1 = collector->GetFlushedFiles();
+  collector->ClearFlushedFiles();
+  for (int i = 0; i < 500; ++i) {
+    db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+  }
+  reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+  auto l0_files_2 = collector->GetFlushedFiles();
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
+  // No assertion failure is expected.
+  delete db;
+}
+
+TEST_F(CompactFilesTest, CapturingPendingFiles) {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  // Always do full scans for obsolete files (needed to reproduce the issue).
+  options.delete_obsolete_files_period_micros = 0;
+
+  // Add listener.
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  DestroyDB(db_name_, options);
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  // Create 5 files.
+  for (int i = 0; i < 5; ++i) {
+    db->Put(WriteOptions(), "key" + ToString(i), "value");
+    db->Flush(FlushOptions());
+  }
+
+  auto l0_files = collector->GetFlushedFiles();
+  EXPECT_EQ(5, l0_files.size());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"},
+      {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start compacting files.
+  ROCKSDB_NAMESPACE::port::Thread compaction_thread(
+      [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); });
+
+  // In the meantime flush another file.
+  TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0");
+  db->Put(WriteOptions(), "key5", "value");
+  db->Flush(FlushOptions());
+  TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1");
+
+  compaction_thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  delete db;
+
+  // Make sure we can reopen the DB.
+  s = DB::Open(options, db_name_, &db);
+  ASSERT_TRUE(s.ok());
+  assert(db);
+  delete db;
+}
+
+TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
+  class FilterWithGet : public CompactionFilter {
+   public:
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+                std::string* /*new_value*/,
+                bool* /*value_changed*/) const override {
+      if (db_ == nullptr) {
+        return true;
+      }
+      std::string res;
+      db_->Get(ReadOptions(), "", &res);
+      return true;
+    }
+
+    void SetDB(DB* db) {
+      db_ = db;
+    }
+
+    const char* Name() const override { return "FilterWithGet"; }
+
+   private:
+    DB* db_;
+  };
+
+
+  std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
+
+  Options options;
+  options.create_if_missing = true;
+  options.compaction_filter = cf.get();
+
+  DB* db = nullptr;
+  DestroyDB(db_name_, options);
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+
+  cf->SetDB(db);
+
+  // Write one L0 file.
+  db->Put(WriteOptions(), "K1", "V1");
+  db->Flush(FlushOptions());
+
+  // Compact all L0 files using CompactFiles.
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    std::string fname = file.db_path + "/" + file.name;
+    ASSERT_OK(
+        db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
+  }
+
+
+  delete db;
+}
+
+TEST_F(CompactFilesTest, SentinelCompressionType) {
+  if (!Zlib_Supported()) {
+    fprintf(stderr, "zlib compression not supported, skip this test\n");
+    return;
+  }
+  if (!Snappy_Supported()) {
+    fprintf(stderr, "snappy compression not supported, skip this test\n");
+    return;
+  }
+  // Check that passing `CompressionType::kDisableCompressionOption` to
+  // `CompactFiles` causes it to use the column family compression options.
+  for (auto compaction_style :
+       {CompactionStyle::kCompactionStyleLevel,
+        CompactionStyle::kCompactionStyleUniversal,
+        CompactionStyle::kCompactionStyleNone}) {
+    DestroyDB(db_name_, Options());
+    Options options;
+    options.compaction_style = compaction_style;
+    // L0: Snappy, L1: Zlib, L2: Snappy
+    options.compression_per_level = {CompressionType::kSnappyCompression,
+                                     CompressionType::kZlibCompression,
+                                     CompressionType::kSnappyCompression};
+    options.create_if_missing = true;
+    FlushedFileCollector* collector = new FlushedFileCollector();
+    options.listeners.emplace_back(collector);
+    DB* db = nullptr;
+    ASSERT_OK(DB::Open(options, db_name_, &db));
+
+    db->Put(WriteOptions(), "key", "val");
+    db->Flush(FlushOptions());
+
+    auto l0_files = collector->GetFlushedFiles();
+    ASSERT_EQ(1, l0_files.size());
+
+    // This is an L0->L1 compaction, so the output should be Zlib-compressed.
+    CompactionOptions compaction_opts;
+    compaction_opts.compression = CompressionType::kDisableCompressionOption;
+    ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1));
+
+    ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
+    ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
+    for (const auto& name_and_table_props : all_tables_props) {
+      ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
+                name_and_table_props.second->compression_name);
+    }
+    delete db;
+  }
+}
+
+TEST_F(CompactFilesTest, GetCompactionJobInfo) {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = 1000;
+  options.level0_stop_writes_trigger = 1000;
+  options.write_buffer_size = 65536;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+  options.max_compaction_bytes = 5000;
+
+  // Add listener.
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  DestroyDB(db_name_, options);
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  // Create a couple of files.
+  for (int i = 0; i < 500; ++i) {
+    db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26)));
+  }
+  reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable();
+  auto l0_files_1 = collector->GetFlushedFiles();
+  CompactionOptions co;
+  co.compression = CompressionType::kLZ4Compression;
+  CompactionJobInfo compaction_job_info{};
+  ASSERT_OK(
+      db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info));
+  ASSERT_EQ(compaction_job_info.base_input_level, 0);
+  ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID());
+  ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName());
+  ASSERT_EQ(compaction_job_info.compaction_reason,
+            CompactionReason::kManualCompaction);
+  ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression);
+  ASSERT_EQ(compaction_job_info.output_level, 0);
+  ASSERT_OK(compaction_job_info.status);
+  // No assertion failure is expected.
+  delete db;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compacted_db_impl.cc b/src/rocksdb/db/compacted_db_impl.cc
new file mode 100644
index 000000000..47d6ecced
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
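+
+// CompactedDBImpl (defined below) is a read-only DB tailored to a fully
+// compacted LSM tree: all live data sits in a single sorted run (one L0
+// file, or one non-empty level), so Get() can binary-search that run
+// directly via FindFile() with no memtables or merging iterators, which is
+// also why Open() insists on max_open_files == -1 (every table stays open).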
+
+#ifndef ROCKSDB_LITE
+#include "db/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "table/get_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
+                                 const std::string& dbname)
+    : DBImpl(options, dbname),
+      cfd_(nullptr),
+      version_(nullptr),
+      user_comparator_(nullptr) {}
+
+CompactedDBImpl::~CompactedDBImpl() {}
+
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+  size_t right = files_.num_files - 1;
+  auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+    return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+  };
+  return static_cast<size_t>(
+      std::lower_bound(files_.files, files_.files + right, key, cmp) -
+      files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+                            const Slice& key, PinnableSlice* value) {
+  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, key, value, nullptr, nullptr,
+                         true, nullptr, nullptr);
+  LookupKey lkey(key, kMaxSequenceNumber);
+  files_.files[FindFile(key)].fd.table_reader->Get(
+      options, lkey.internal_key(), &get_context, nullptr);
+  if (get_context.State() == GetContext::kFound) {
+    return Status::OK();
+  }
+  return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  autovector<TableReader*> reader_list;
+  for (const auto& key : keys) {
+    const FdWithKeyRange& f = files_.files[FindFile(key)];
+    if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
+      reader_list.push_back(nullptr);
+    } else {
+      LookupKey lkey(key, kMaxSequenceNumber);
+      f.fd.table_reader->Prepare(lkey.internal_key());
+      reader_list.push_back(f.fd.table_reader);
+    }
+  }
+  std::vector<Status> statuses(keys.size(), Status::NotFound());
+  values->resize(keys.size());
+  int idx = 0;
+  for (auto* r : reader_list) {
+    if (r != nullptr) {
+      PinnableSlice pinnable_val;
+      std::string& value = (*values)[idx];
+      GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, keys[idx], &pinnable_val,
+                             nullptr, nullptr, true, nullptr, nullptr);
+      LookupKey lkey(keys[idx], kMaxSequenceNumber);
+      r->Get(options, lkey.internal_key(), &get_context, nullptr);
+      value.assign(pinnable_val.data(), pinnable_val.size());
+      if (get_context.State() == GetContext::kFound) {
+        statuses[idx] = Status::OK();
+      }
+    }
+    ++idx;
+  }
+  return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  mutex_.Lock();
+  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+                            ColumnFamilyOptions(options));
+  Status s = Recover({cf}, true /* read only */, false, true);
+  if (s.ok()) {
+    cfd_ = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())
+               ->cfd();
+    cfd_->InstallSuperVersion(&sv_context, &mutex_);
+  }
+  mutex_.Unlock();
+  sv_context.Clean();
+  if (!s.ok()) {
+    return s;
+  }
+  NewThreadStatusCfInfo(cfd_);
+  version_ = cfd_->GetSuperVersion()->current;
+  user_comparator_ = cfd_->user_comparator();
+  auto* vstorage = version_->storage_info();
+  if (vstorage->num_non_empty_levels() == 0) {
+    return Status::NotSupported("no file exists");
+  }
+  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+  // L0 may have at most one file.
+  if (l0.num_files > 1) {
+    return Status::NotSupported("L0 contains more than 1 file");
+  }
+  if (l0.num_files == 1) {
+    if (vstorage->num_non_empty_levels() > 1) {
+      return Status::NotSupported("Both L0 and other levels contain files");
+    }
+    files_ = l0;
+    return Status::OK();
+  }
+
+  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+    if (vstorage->LevelFilesBrief(i).num_files > 0) {
+      return Status::NotSupported("Other levels also contain files");
+    }
+  }
+
+  int level = vstorage->num_non_empty_levels() - 1;
+  if (vstorage->LevelFilesBrief(level).num_files > 0) {
+    files_ = vstorage->LevelFilesBrief(level);
+    return Status::OK();
+  }
+  return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
+                             DB** dbptr) {
+  *dbptr = nullptr;
+
+  if (options.max_open_files != -1) {
+    return Status::InvalidArgument("require max_open_files = -1");
+  }
+  if (options.merge_operator.get() != nullptr) {
+    return Status::InvalidArgument("merge operator is not supported");
+  }
+  DBOptions db_options(options);
+  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+  Status s = db->Init(options);
+  if (s.ok()) {
+    db->StartTimedTasks();
+    ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+                   "Opened the db in fully compacted mode");
+    LogFlush(db->immutable_db_options_.info_log);
+    *dbptr = db.release();
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compacted_db_impl.h b/src/rocksdb/db/compacted_db_impl.h
new file mode 100644
index 000000000..7099566fc
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
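+
+// Minimal usage sketch (illustrative only; the path is hypothetical and the
+// option checks mirror CompactedDBImpl::Open in the .cc file):
+//
+//   Options options;
+//   options.max_open_files = -1;  // required by CompactedDBImpl::Open
+//   DB* db = nullptr;
+//   Status s = CompactedDBImpl::Open(options, "/path/to/compacted_db", &db);
+//   if (s.ok()) { /* read-only access: Get()/MultiGet() only */ }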
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactedDBImpl : public DBImpl {
+ public:
+  CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+  // No copying allowed
+  CompactedDBImpl(const CompactedDBImpl&) = delete;
+  void operator=(const CompactedDBImpl&) = delete;
+
+  virtual ~CompactedDBImpl();
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DB** dbptr);
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& /*options*/,
+                        ColumnFamilyHandle* /*column_family*/,
+                        const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status Write(const WriteOptions& /*options*/,
+                       WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice* /*begin*/,
+                              const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>& ret,
+                              uint64_t* manifest_file_size,
+                              bool /*flush_memtable*/) override {
+    return DBImpl::GetLiveFiles(ret, manifest_file_size,
+                                false /* flush_memtable */);
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+ private:
+  friend class DB;
+  inline size_t FindFile(const Slice& key);
+  Status Init(const Options& options);
+
+  ColumnFamilyData* cfd_;
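+  // These cached members (cfd_, version_, user_comparator_, files_) are set
+  // once in Init() and stay valid for the lifetime of the DB: a compacted,
+  // read-only DB never installs a new Version, so plain pointers suffice
+  // here without extra reference counting.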
+  Version* version_;
+  const Comparator* user_comparator_;
+  LevelFilesBrief files_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..5c34fdcaa
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "rocksdb/compaction_filter.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b) {
+  auto c = user_cmp->Compare(a.user_key(), b.user_key());
+  if (c != 0) {
+    return c;
+  }
+  auto a_footer = ExtractInternalKeyFooter(a.Encode());
+  auto b_footer = ExtractInternalKeyFooter(b.Encode());
+  if (a_footer == kRangeTombstoneSentinel) {
+    if (b_footer != kRangeTombstoneSentinel) {
+      return -1;
+    }
+  } else if (b_footer == kRangeTombstoneSentinel) {
+    return 1;
+  }
+  return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b) {
+  if (a == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b) {
+  if (b == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, a, *b);
+}
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->fd.GetFileSize();
+  }
+  return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+  input_version_ = _input_version;
+  cfd_ = input_version_->cfd();
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+    Slice* largest_user_key) {
+  bool initialized = false;
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].files.empty()) {
+      continue;
+    }
+    if (inputs[i].level == 0) {
+      // we need to consider all files on level 0
+      for (const auto* f : inputs[i].files) {
+        const Slice& start_user_key = f->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = f->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
+  }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+    VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i].level == 0 || inputs[i].files.empty()) {
+      continue;
+    }
+    inputs[i].atomic_compaction_unit_boundaries.reserve(
+        inputs[i].files.size());
+    AtomicCompactionUnitBoundary cur_boundary;
+    size_t first_atomic_idx = 0;
+    auto add_unit_boundary = [&](size_t to) {
+      if (first_atomic_idx == to) return;
+      for (size_t k = first_atomic_idx; k < to; k++) {
+        inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+      }
+      first_atomic_idx = to;
+    };
+    for (size_t j = 0; j < inputs[i].files.size(); j++) {
+      const auto* f = inputs[i].files[j];
+      if (j == 0) {
+        // First file in a level.
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+                 0) {
+        // SSTs overlap but the end key of the previous file was not
+        // artificially extended by a range tombstone. Extend the current
+        // boundary.
+        cur_boundary.largest = &f->largest;
+      } else {
+        // Atomic compaction unit has ended.
+        add_unit_boundary(j);
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      }
+    }
+    add_unit_boundary(inputs[i].files.size());
+    assert(inputs[i].files.size() ==
+           inputs[i].atomic_compaction_unit_boundaries.size());
+  }
+  return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  int output_l0_idx;
+  if (output_level == 0) {
+    output_l0_idx = 0;
+    for (const auto* file : vstorage->LevelFiles(0)) {
+      if (inputs[0].files.back() == file) {
+        break;
+      }
+      ++output_l0_idx;
+    }
+    assert(static_cast<size_t>(output_l0_idx) <
+           vstorage->LevelFiles(0).size());
+  } else {
+    output_l0_idx = -1;
+  }
+  Slice smallest_key, largest_key;
+  GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+  return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+                                                  output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  size_t num_files_in_compaction = 0;
+  size_t total_num_files = 0;
+  for (int l = 0; l < vstorage->num_levels(); l++) {
+    total_num_files += vstorage->NumLevelFiles(l);
+  }
+  for (size_t i = 0; i < inputs.size(); i++) {
+    num_files_in_compaction += inputs[i].size();
+  }
+  return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(VersionStorageInfo* vstorage,
+                       const ImmutableCFOptions& _immutable_cf_options,
+                       const MutableCFOptions& _mutable_cf_options,
+                       std::vector<CompactionInputFiles> _inputs,
+                       int _output_level, uint64_t _target_file_size,
+                       uint64_t _max_compaction_bytes,
+                       uint32_t _output_path_id,
CompressionType _compression, + CompressionOptions _compression_opts, + uint32_t _max_subcompactions, + std::vector _grandparents, + bool _manual_compaction, double _score, + bool _deletion_compaction, + CompactionReason _compaction_reason) + : input_vstorage_(vstorage), + start_level_(_inputs[0].level), + output_level_(_output_level), + max_output_file_size_(_target_file_size), + max_compaction_bytes_(_max_compaction_bytes), + max_subcompactions_(_max_subcompactions), + immutable_cf_options_(_immutable_cf_options), + mutable_cf_options_(_mutable_cf_options), + input_version_(nullptr), + number_levels_(vstorage->num_levels()), + cfd_(nullptr), + output_path_id_(_output_path_id), + output_compression_(_compression), + output_compression_opts_(_compression_opts), + deletion_compaction_(_deletion_compaction), + inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), + grandparents_(std::move(_grandparents)), + score_(_score), + bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), + is_full_compaction_(IsFullCompaction(vstorage, inputs_)), + is_manual_compaction_(_manual_compaction), + is_trivial_move_(false), + compaction_reason_(_compaction_reason) { + MarkFilesBeingCompacted(true); + if (is_manual_compaction_) { + compaction_reason_ = CompactionReason::kManualCompaction; + } + if (max_subcompactions_ == 0) { + max_subcompactions_ = immutable_cf_options_.max_subcompactions; + } + if (!bottommost_level_) { + // Currently we only enable dictionary compression during compaction to the + // bottommost level. + output_compression_opts_.max_dict_bytes = 0; + output_compression_opts_.zstd_max_train_bytes = 0; + } + +#ifndef NDEBUG + for (size_t i = 1; i < inputs_.size(); ++i) { + assert(inputs_[i].level > inputs_[i - 1].level); + } +#endif + + // setup input_levels_ + { + input_levels_.resize(num_input_levels()); + for (size_t which = 0; which < num_input_levels(); which++) { + DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, + &arena_); + } + } + + GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_); +} + +Compaction::~Compaction() { + if (input_version_ != nullptr) { + input_version_->Unref(); + } + if (cfd_ != nullptr) { + cfd_->UnrefAndTryDelete(); + } +} + +bool Compaction::InputCompressionMatchesOutput() const { + int base_level = input_vstorage_->base_level(); + bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_, + mutable_cf_options_, start_level_, + base_level) == output_compression_); + if (matches) { + TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches"); + return true; + } + TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch"); + return matches; +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If start_level_== output_level_, the purpose is to force compaction + // filter to be applied to that level, and thus cannot be a trivial move. 
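+  //
+  // Illustrative sketch (not upstream code) of the core eligibility test
+  // applied further below, once the early-outs have been taken:
+  //
+  //   start_level_ != output_level_ &&                    // a real move down
+  //   num_input_levels() == 1 &&                          // one input level
+  //   input(0, 0)->fd.GetPathId() == output_path_id() &&  // same DB path
+  //   InputCompressionMatchesOutput()                     // no re-compression
+  //
+  // Even when all of these hold, each file is still checked against the
+  // grandparent level so a moved file cannot exceed max_compaction_bytes_.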
+ + // Check if start level have files with overlapping ranges + if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) { + // We cannot move files from L0 to L1 if the files are overlapping + return false; + } + + if (is_manual_compaction_ && + (immutable_cf_options_.compaction_filter != nullptr || + immutable_cf_options_.compaction_filter_factory != nullptr)) { + // This is a manual compaction and we have a compaction filter that should + // be executed, we cannot do a trivial move + return false; + } + + // Used in universal compaction, where trivial move can be done if the + // input files are non overlapping + if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && + (output_level_ != 0)) { + return is_trivial_move_; + } + + if (!(start_level_ != output_level_ && num_input_levels() == 1 && + input(0, 0)->fd.GetPathId() == output_path_id() && + InputCompressionMatchesOutput())) { + return false; + } + + // assert inputs_.size() == 1 + + for (const auto& file : inputs_.front().files) { + std::vector file_grand_parents; + if (output_level_ + 1 >= number_levels_) { + continue; + } + input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, + &file->largest, &file_grand_parents); + const auto compaction_size = + file->fd.GetFileSize() + TotalFileSize(file_grand_parents); + if (compaction_size > max_compaction_bytes_) { + return false; + } + } + + return true; +} + +void Compaction::AddInputDeletions(VersionEdit* out_edit) { + for (size_t which = 0; which < num_input_levels(); which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); + } + } +} + +bool Compaction::KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector* level_ptrs) const { + assert(input_version_ != nullptr); + assert(level_ptrs != nullptr); + assert(level_ptrs->size() == static_cast(number_levels_)); + if (bottommost_level_) { + return true; + } else if (output_level_ != 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = cfd_->user_comparator(); + for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { + const std::vector& files = + input_vstorage_->LevelFiles(lvl); + for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) { + auto* f = files[level_ptrs->at(lvl)]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so it may + // exist beyond output level + return false; + } + break; + } + } + } + return true; + } + return false; +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { + for (size_t i = 0; i < num_input_levels(); i++) { + for (size_t j = 0; j < inputs_[i].size(); j++) { + assert(mark_as_compacted ? 
!inputs_[i][j]->being_compacted + : inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = mark_as_compacted; + } + } +} + +// Sample output: +// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5, +// print: "3@0 + 2@3 + 1@4 files to L5" +const char* Compaction::InputLevelSummary( + InputLevelSummaryBuffer* scratch) const { + int len = 0; + bool is_first = true; + for (auto& input_level : inputs_) { + if (input_level.empty()) { + continue; + } + if (!is_first) { + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + "); + len = std::min(len, static_cast(sizeof(scratch->buffer))); + } else { + is_first = false; + } + len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "%" ROCKSDB_PRIszt "@%d", input_level.size(), + input_level.level); + len = std::min(len, static_cast(sizeof(scratch->buffer))); + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + " files to L%d", output_level()); + + return scratch->buffer; +} + +uint64_t Compaction::CalculateTotalInputSize() const { + uint64_t size = 0; + for (auto& input_level : inputs_) { + for (auto f : input_level.files) { + size += f->fd.GetFileSize(); + } + } + return size; +} + +void Compaction::ReleaseCompactionFiles(Status status) { + MarkFilesBeingCompacted(false); + cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); +} + +void Compaction::ResetNextCompactionIndex() { + assert(input_version_ != nullptr); + input_vstorage_->ResetNextCompactionIndex(start_level_); +} + +namespace { +int InputSummary(const std::vector& files, char* output, + int len) { + *output = '\0'; + int write = 0; + for (size_t i = 0; i < files.size(); i++) { + int sz = len - write; + int ret; + char sztxt[16]; + AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16); + ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", + files.at(i)->fd.GetNumber(), sztxt); + if (ret < 0 || ret >= sz) break; + write += ret; + } + // if files.size() is non-zero, overwrite the last space + return write - !!files.size(); +} +} // namespace + +void Compaction::Summary(char* output, int len) { + int write = + snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [", + input_version_->GetVersionNumber(), start_level_); + if (write < 0 || write >= len) { + return; + } + + for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + if (level_iter > 0) { + write += snprintf(output + write, len - write, "], ["); + if (write < 0 || write >= len) { + return; + } + } + write += + InputSummary(inputs_[level_iter].files, output + write, len - write); + if (write < 0 || write >= len) { + return; + } + } + + snprintf(output + write, len - write, "]"); +} + +uint64_t Compaction::OutputFilePreallocationSize() const { + uint64_t preallocation_size = 0; + + for (const auto& level_files : inputs_) { + for (const auto& file : level_files.files) { + preallocation_size += file->fd.GetFileSize(); + } + } + + if (max_output_file_size_ != port::kMaxUint64 && + (immutable_cf_options_.compaction_style == kCompactionStyleLevel || + output_level() > 0)) { + preallocation_size = std::min(max_output_file_size_, preallocation_size); + } + + // Over-estimate slightly so we don't end up just barely crossing + // the threshold + // No point to prellocate more than 1GB. 
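+  //
+  // Worked example (illustrative numbers only): with 64 MB of input and no
+  // smaller max_output_file_size_ cap, this preallocates
+  // 64 MB + 64 MB / 10, i.e. roughly 70.4 MB, well under the 1 GB ceiling.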
+ return std::min(uint64_t{1073741824}, + preallocation_size + (preallocation_size / 10)); +} + +std::unique_ptr Compaction::CreateCompactionFilter() const { + if (!cfd_->ioptions()->compaction_filter_factory) { + return nullptr; + } + + CompactionFilter::Context context; + context.is_full_compaction = is_full_compaction_; + context.is_manual_compaction = is_manual_compaction_; + context.column_family_id = cfd_->GetID(); + return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( + context); +} + +bool Compaction::IsOutputLevelEmpty() const { + return inputs_.back().level != output_level_ || inputs_.back().empty(); +} + +bool Compaction::ShouldFormSubcompactions() const { + if (max_subcompactions_ <= 1 || cfd_ == nullptr) { + return false; + } + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && + !IsOutputLevelEmpty(); + } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { + return number_levels_ > 1 && output_level_ > 0; + } else { + return false; + } +} + +uint64_t Compaction::MinInputFileOldestAncesterTime() const { + uint64_t min_oldest_ancester_time = port::kMaxUint64; + for (const auto& level_files : inputs_) { + for (const auto& file : level_files.files) { + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0) { + min_oldest_ancester_time = + std::min(min_oldest_ancester_time, oldest_ancester_time); + } + } + } + return min_oldest_ancester_time; +} + +int Compaction::GetInputBaseLevel() const { + return input_vstorage_->base_level(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h new file mode 100644 index 000000000..9358e50ff --- /dev/null +++ b/src/rocksdb/db/compaction/compaction.h @@ -0,0 +1,384 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" +#include "memory/arena.h" +#include "options/cf_options.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { +// The file contains class Compaction, as well as some helper functions +// and data structures used by the class. + +// Utility for comparing sstable boundary keys. Returns -1 if either a or b is +// null which provides the property that a==null indicates a key that is less +// than any key and b==null indicates a key that is greater than any key. Note +// that the comparison is performed primarily on the user-key portion of the +// key. If the user-keys compare equal, an additional test is made to sort +// range tombstone sentinel keys before other keys with the same user-key. The +// result is that 2 user-keys will compare equal if they differ purely on +// their sequence number and value, but the range tombstone sentinel for that +// user-key will compare not equal. This is necessary because the range +// tombstone sentinel key is set as the largest key for an sstable even though +// that key never appears in the database. 
We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every two neighbouring SSTs in this range "overlap" (i.e., the
+// largest user key of one file is the smallest user key of the next file).
+// These boundaries are propagated down to RangeDelAggregator during
+// compaction to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+  const InternalKey* smallest = nullptr;
+  const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+  int level;
+  std::vector<FileMetaData*> files;
+  std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+  inline bool empty() const { return files.empty(); }
+  inline size_t size() const { return files.size(); }
+  inline void clear() { files.clear(); }
+  inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+  Compaction(VersionStorageInfo* input_version,
+             const ImmutableCFOptions& immutable_cf_options,
+             const MutableCFOptions& mutable_cf_options,
+             std::vector<CompactionInputFiles> inputs, int output_level,
+             uint64_t target_file_size, uint64_t max_compaction_bytes,
+             uint32_t output_path_id, CompressionType compression,
+             CompressionOptions compression_opts, uint32_t max_subcompactions,
+             std::vector<FileMetaData*> grandparents,
+             bool manual_compaction = false, double score = -1,
+             bool deletion_compaction = false,
+             CompactionReason compaction_reason = CompactionReason::kUnknown);
+
+  // No copying allowed
+  Compaction(const Compaction&) = delete;
+  void operator=(const Compaction&) = delete;
+
+  ~Compaction();
+
+  // Returns the level associated with the specified compaction input level.
+  // If compaction_input_level is not specified, it defaults to 0.
+  int level(size_t compaction_input_level = 0) const {
+    return inputs_[compaction_input_level].level;
+  }
+
+  int start_level() const { return start_level_; }
+
+  // Outputs will go to this level
+  int output_level() const { return output_level_; }
+
+  // Returns the number of input levels in this compaction.
+  size_t num_input_levels() const { return inputs_.size(); }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return &edit_; }
+
+  // Returns the number of input files associated with the specified
+  // compaction input level.
+  // The function returns 0 when "compaction_input_level" < 0
+  // or "compaction_input_level" >= "num_input_levels()".
+  size_t num_input_files(size_t compaction_input_level) const {
+    if (compaction_input_level < inputs_.size()) {
+      return inputs_[compaction_input_level].size();
+    }
+    return 0;
+  }
+
+  // Returns the input version of the compaction
+  Version* input_version() const { return input_version_; }
+
+  // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; } + + // Returns the file meta data of the 'i'th input file at the + // specified compaction input level. + // REQUIREMENT: "compaction_input_level" must be >= 0 and + // < "input_levels()" + FileMetaData* input(size_t compaction_input_level, size_t i) const { + assert(compaction_input_level < inputs_.size()); + return inputs_[compaction_input_level][i]; + } + + const std::vector* boundaries( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries; + } + + // Returns the list of file meta data of the specified compaction + // input level. + // REQUIREMENT: "compaction_input_level" must be >= 0 and + // < "input_levels()" + const std::vector* inputs( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].files; + } + + const std::vector* inputs() { return &inputs_; } + + // Returns the LevelFilesBrief of the specified compaction input level. + const LevelFilesBrief* input_levels(size_t compaction_input_level) const { + return &input_levels_[compaction_input_level]; + } + + // Maximum size of files to build during this compaction. + uint64_t max_output_file_size() const { return max_output_file_size_; } + + // What compression for output + CompressionType output_compression() const { return output_compression_; } + + // What compression options for output + CompressionOptions output_compression_opts() const { + return output_compression_opts_; + } + + // Whether need to write output file to second DB path. + uint32_t output_path_id() const { return output_path_id_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // If true, then the compaction can be done by simply deleting input files. + bool deletion_compaction() const { return deletion_compaction_; } + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the available information we have guarantees that + // the input "user_key" does not exist in any level beyond "output_level()". + bool KeyNotExistsBeyondOutputLevel(const Slice& user_key, + std::vector* level_ptrs) const; + + // Clear all files to indicate that they are not being compacted + // Delete this compaction from the list of running compactions. + // + // Requirement: DB mutex held + void ReleaseCompactionFiles(Status status); + + // Returns the summary of the compaction in "output" with maximum "len" + // in bytes. The caller is responsible for the memory management of + // "output". + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level() const { return bottommost_level_; } + + // Does this compaction include all sst files? + bool is_full_compaction() const { return is_full_compaction_; } + + // Was this compaction triggered manually by the client? + bool is_manual_compaction() const { return is_manual_compaction_; } + + // Used when allow_trivial_move option is set in + // Universal compaction. 
If all the input files are + // non overlapping, then is_trivial_move_ variable + // will be set true, else false + void set_is_trivial_move(bool trivial_move) { + is_trivial_move_ = trivial_move; + } + + // Used when allow_trivial_move option is set in + // Universal compaction. Returns true, if the input files + // are non-overlapping and can be trivially moved. + bool is_trivial_move() const { return is_trivial_move_; } + + // How many total levels are there? + int number_levels() const { return number_levels_; } + + // Return the ImmutableCFOptions that should be used throughout the compaction + // procedure + const ImmutableCFOptions* immutable_cf_options() const { + return &immutable_cf_options_; + } + + // Return the MutableCFOptions that should be used throughout the compaction + // procedure + const MutableCFOptions* mutable_cf_options() const { + return &mutable_cf_options_; + } + + // Returns the size in bytes that the output file should be preallocated to. + // In level compaction, that is max_file_size_. In universal compaction, that + // is the sum of all input file sizes. + uint64_t OutputFilePreallocationSize() const; + + void SetInputVersion(Version* input_version); + + struct InputLevelSummaryBuffer { + char buffer[128]; + }; + + const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const; + + uint64_t CalculateTotalInputSize() const; + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); + + // Create a CompactionFilter from compaction_filter_factory + std::unique_ptr CreateCompactionFilter() const; + + // Is the input level corresponding to output_level_ empty? + bool IsOutputLevelEmpty() const; + + // Should this compaction be broken up into smaller ones run in parallel? + bool ShouldFormSubcompactions() const; + + // test function to validate the functionality of IsBottommostLevel() + // function -- determines if compaction with inputs and storage is bottommost + static bool TEST_IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector& inputs); + + TablePropertiesCollection GetOutputTableProperties() const { + return output_table_properties_; + } + + void SetOutputTableProperties(TablePropertiesCollection tp) { + output_table_properties_ = std::move(tp); + } + + Slice GetSmallestUserKey() const { return smallest_user_key_; } + + Slice GetLargestUserKey() const { return largest_user_key_; } + + int GetInputBaseLevel() const; + + CompactionReason compaction_reason() { return compaction_reason_; } + + const std::vector& grandparents() const { + return grandparents_; + } + + uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } + + uint32_t max_subcompactions() const { return max_subcompactions_; } + + uint64_t MinInputFileOldestAncesterTime() const; + + private: + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool mark_as_compacted); + + // get the smallest and largest key present in files to be compacted + static void GetBoundaryKeys(VersionStorageInfo* vstorage, + const std::vector& inputs, + Slice* smallest_key, Slice* largest_key); + + // Get the atomic file boundaries for all files in the compaction. Necessary + // in order to avoid the scenario described in + // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb + // down appropriate key boundaries to RangeDelAggregator during compaction. 
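+  //
+  // Illustrative example (hypothetical files, not from the code): if a level
+  // holds F1[a..c], F2[c..e] and F3[f..g], where F1's largest key compares
+  // equal to F2's smallest key under sstableKeyCompare(), then F1 and F2 form
+  // one atomic unit and F3 its own, so the per-file boundaries come out as:
+  //
+  //   { {&F1->smallest, &F2->largest},    // F1's unit: [a, e]
+  //     {&F1->smallest, &F2->largest},    // F2's unit: [a, e]
+  //     {&F3->smallest, &F3->largest} }   // F3's unit: [f, g]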
+ static std::vector PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector inputs); + + // helper function to determine if compaction with inputs and storage is + // bottommost + static bool IsBottommostLevel( + int output_level, VersionStorageInfo* vstorage, + const std::vector& inputs); + + static bool IsFullCompaction(VersionStorageInfo* vstorage, + const std::vector& inputs); + + VersionStorageInfo* input_vstorage_; + + const int start_level_; // the lowest level to be compacted + const int output_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t max_compaction_bytes_; + uint32_t max_subcompactions_; + const ImmutableCFOptions immutable_cf_options_; + const MutableCFOptions mutable_cf_options_; + Version* input_version_; + VersionEdit edit_; + const int number_levels_; + ColumnFamilyData* cfd_; + Arena arena_; // Arena used to allocate space for file_levels_ + + const uint32_t output_path_id_; + CompressionType output_compression_; + CompressionOptions output_compression_opts_; + // If true, then the comaction can be done by simply deleting input files. + const bool deletion_compaction_; + + // Compaction input files organized by level. Constant after construction + const std::vector inputs_; + + // A copy of inputs_, organized more closely in memory + autovector input_levels_; + + // State used to check for number of overlapping grandparent files + // (grandparent == "output_level_ + 1") + std::vector grandparents_; + const double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + const bool bottommost_level_; + // Does this compaction include all sst files? + const bool is_full_compaction_; + + // Is this compaction requested by the client? + const bool is_manual_compaction_; + + // True if we can do trivial move in Universal multi level + // compaction + bool is_trivial_move_; + + // Does input compression match the output compression? + bool InputCompressionMatchesOutput() const; + + // table properties of output files + TablePropertiesCollection output_table_properties_; + + // smallest user keys in compaction + Slice smallest_user_key_; + + // largest user keys in compaction + Slice largest_user_key_; + + // Reason for compaction + CompactionReason compaction_reason_; +}; + +// Return sum of sizes of all files in `files`. +extern uint64_t TotalFileSize(const std::vector& files); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h new file mode 100644 index 000000000..963c1d8eb --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h @@ -0,0 +1,37 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +struct CompactionIterationStats { + // Compaction statistics + + // Doesn't include records skipped because of + // CompactionFilter::Decision::kRemoveAndSkipUntil. + int64_t num_record_drop_user = 0; + + int64_t num_record_drop_hidden = 0; + int64_t num_record_drop_obsolete = 0; + int64_t num_record_drop_range_del = 0; + int64_t num_range_del_drop_obsolete = 0; + // Deletions obsoleted before bottom level due to file gap optimization. 
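+  // (I.e., cases where KeyNotExistsBeyondOutputLevel() showed that no level
+  // past the output can contain the key, so the tombstone was dropped before
+  // reaching the bottommost level.)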
+ int64_t num_optimized_del_drop_obsolete = 0; + uint64_t total_filter_time = 0; + + // Input statistics + // TODO(noetzli): The stats are incomplete. They are lacking everything + // consumed by MergeHelper. + uint64_t num_input_records = 0; + uint64_t num_input_deletion_records = 0; + uint64_t num_input_corrupt_records = 0; + uint64_t total_input_raw_key_bytes = 0; + uint64_t total_input_raw_value_bytes = 0; + + // Single-Delete diagnostics for exceptional situations + uint64_t num_single_del_fallthru = 0; + uint64_t num_single_del_mismatch = 0; +}; diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc new file mode 100644 index 000000000..1bebfc717 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_iterator.cc @@ -0,0 +1,774 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "db/compaction/compaction_iterator.h" +#include "db/snapshot_checker.h" +#include "port/likely.h" +#include "rocksdb/listener.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ + ((seq) <= (snapshot) && \ + (snapshot_checker_ == nullptr || \ + LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ + SnapshotCheckerResult::kInSnapshot))) + +#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \ + ((seq) > (snapshot) || \ + (snapshot_checker_ != nullptr && \ + UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ + SnapshotCheckerResult::kNotInSnapshot))) + +#define IN_EARLIEST_SNAPSHOT(seq) \ + ((seq) <= earliest_snapshot_ && \ + (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))) + +namespace ROCKSDB_NAMESPACE { + +CompactionIterator::CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction, + const CompactionFilter* compaction_filter, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + const std::atomic* manual_compaction_paused, + const std::shared_ptr info_log) + : CompactionIterator( + input, cmp, merge_helper, last_sequence, snapshots, + earliest_write_conflict_snapshot, snapshot_checker, env, + report_detailed_time, expect_valid_internal_key, range_del_agg, + std::unique_ptr( + compaction ? 
new CompactionProxy(compaction) : nullptr), + compaction_filter, shutting_down, preserve_deletes_seqnum, + manual_compaction_paused, info_log) {} + +CompactionIterator::CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber /*last_sequence*/, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + std::unique_ptr compaction, + const CompactionFilter* compaction_filter, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + const std::atomic* manual_compaction_paused, + const std::shared_ptr info_log) + : input_(input), + cmp_(cmp), + merge_helper_(merge_helper), + snapshots_(snapshots), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + env_(env), + report_detailed_time_(report_detailed_time), + expect_valid_internal_key_(expect_valid_internal_key), + range_del_agg_(range_del_agg), + compaction_(std::move(compaction)), + compaction_filter_(compaction_filter), + shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), + preserve_deletes_seqnum_(preserve_deletes_seqnum), + current_user_key_sequence_(0), + current_user_key_snapshot_(0), + merge_out_iter_(merge_helper_), + current_key_committed_(false), + info_log_(info_log) { + assert(compaction_filter_ == nullptr || compaction_ != nullptr); + assert(snapshots_ != nullptr); + bottommost_level_ = + compaction_ == nullptr ? false : compaction_->bottommost_level(); + if (compaction_ != nullptr) { + level_ptrs_ = std::vector(compaction_->number_levels(), 0); + } + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif + input_->SetPinnedItersMgr(&pinned_iters_mgr_); + TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); +} + +CompactionIterator::~CompactionIterator() { + // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime + input_->SetPinnedItersMgr(nullptr); +} + +void CompactionIterator::ResetRecordCounts() { + iter_stats_.num_record_drop_user = 0; + iter_stats_.num_record_drop_hidden = 0; + iter_stats_.num_record_drop_obsolete = 0; + iter_stats_.num_record_drop_range_del = 0; + iter_stats_.num_range_del_drop_obsolete = 0; + iter_stats_.num_optimized_del_drop_obsolete = 0; +} + +void CompactionIterator::SeekToFirst() { + NextFromInput(); + PrepareOutput(); +} + +void CompactionIterator::Next() { + // If there is a merge output, return it before continuing to process the + // input. + if (merge_out_iter_.Valid()) { + merge_out_iter_.Next(); + + // Check if we returned all records of the merge output. 
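+    // Sketch of the flow (hypothetical operands): if MergeUntil() produced
+    // merged records [m1, m2], m1 became the current output when the merge
+    // was processed in NextFromInput(); this call advances to m2, and the
+    // call after that releases the pinned operands and resumes
+    // NextFromInput().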
+ if (merge_out_iter_.Valid()) { + key_ = merge_out_iter_.key(); + value_ = merge_out_iter_.value(); + bool valid_key __attribute__((__unused__)); + valid_key = ParseInternalKey(key_, &ikey_); + // MergeUntil stops when it encounters a corrupt key and does not + // include them in the result, so we expect the keys here to be valid. + assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } + + // Keep current_key_ in sync. + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + valid_ = true; + } else { + // We consumed all pinned merge operands, release pinned iterators + pinned_iters_mgr_.ReleasePinnedData(); + // MergeHelper moves the iterator to the first record after the merged + // records, so even though we reached the end of the merge output, we do + // not want to advance the iterator. + NextFromInput(); + } + } else { + // Only advance the input iterator if there is no merge output and the + // iterator is not already at the next record. + if (!at_next_) { + input_->Next(); + } + NextFromInput(); + } + + if (valid_) { + // Record that we've outputted a record for the current key. + has_outputted_key_ = true; + } + + PrepareOutput(); +} + +void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, + Slice* skip_until) { + if (compaction_filter_ != nullptr && + (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. + CompactionFilter::Decision filter; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; + { + StopWatchNano timer(env_, report_detailed_time_); + filter = compaction_filter_->FilterV2( + compaction_->level(), filter_key, value_type, value_, + &compaction_filter_value_, compaction_filter_skip_until_.rep()); + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } + + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. 
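+      // E.g. (hypothetical keys): if the current user key is "k2" and the
+      // filter asks to skip until "k1", honoring it would move the iterator
+      // backwards, so the decision is downgraded to kKeep instead.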
+ filter = CompactionFilter::Decision::kKeep; + } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } + } +} + +void CompactionIterator::NextFromInput() { + at_next_ = false; + valid_ = false; + + while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + !IsShuttingDown()) { + key_ = input_->key(); + value_ = input_->value(); + iter_stats_.num_input_records++; + + if (!ParseInternalKey(key_, &ikey_)) { + // If `expect_valid_internal_key_` is false, return the corrupted key + // and let the caller decide what to do with it. + // TODO(noetzli): We should have a more elegant solution for this. + if (expect_valid_internal_key_) { + assert(!"Corrupted internal key not expected."); + status_ = Status::Corruption("Corrupted internal key not expected."); + break; + } + key_ = current_key_.SetInternalKey(key_); + has_current_user_key_ = false; + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + iter_stats_.num_input_corrupt_records++; + valid_ = true; + break; + } + TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); + + // Update input statistics + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + iter_stats_.num_input_deletion_records++; + } + iter_stats_.total_input_raw_key_bytes += key_.size(); + iter_stats_.total_input_raw_value_bytes += value_.size(); + + // If need_skip is true, we should seek the input iterator + // to internal key skip_until and continue from there. + bool need_skip = false; + // Points either into compaction_filter_skip_until_ or into + // merge_helper_->compaction_filter_skip_until_. + Slice skip_until; + + // Check whether the user key changed. After this if statement current_key_ + // is a copy of the current input key (maybe converted to a delete by the + // compaction filter). ikey_.user_key is pointing to the copy. + if (!has_current_user_key_ || + !cmp_->Equal(ikey_.user_key, current_user_key_)) { + // First occurrence of this user key + // Copy key for output + key_ = current_key_.SetInternalKey(key_, &ikey_); + current_user_key_ = ikey_.user_key; + has_current_user_key_ = true; + has_outputted_key_ = false; + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + current_key_committed_ = KeyCommitted(ikey_.sequence); + + // Apply the compaction filter to the first committed version of the user + // key. + if (current_key_committed_) { + InvokeFilterIfNeeded(&need_skip, &skip_until); + } + } else { + // Update the current key to reflect the new sequence number/type without + // copying the user key. 
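+      // E.g. (hypothetical versions): when stepping from Put@5 to Put@3 of
+      // the same user key, only the packed (sequence, type) trailer of
+      // current_key_ is rewritten; the user key bytes stay in place.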
+      // TODO(rven): Compaction filter does not process keys in this path
+      // Need to have the compaction filter process multiple versions
+      // if we have versions on both sides of a snapshot
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      key_ = current_key_.GetInternalKey();
+      ikey_.user_key = current_key_.GetUserKey();
+
+      // Note that a newer version of a key is ordered before older versions.
+      // If a newer version of a key is committed, so is the older version. No
+      // need to query snapshot_checker_ in that case.
+      if (UNLIKELY(!current_key_committed_)) {
+        assert(snapshot_checker_ != nullptr);
+        current_key_committed_ = KeyCommitted(ikey_.sequence);
+        // Apply the compaction filter to the first committed version of the
+        // user key.
+        if (current_key_committed_) {
+          InvokeFilterIfNeeded(&need_skip, &skip_until);
+        }
+      }
+    }
+
+    if (UNLIKELY(!current_key_committed_)) {
+      assert(snapshot_checker_ != nullptr);
+      valid_ = true;
+      break;
+    }
+
+    // If there are no snapshots, then this kv affects visibility at tip.
+    // Otherwise, search through all existing snapshots to find the earliest
+    // snapshot that is affected by this kv.
+    SequenceNumber last_sequence __attribute__((__unused__));
+    last_sequence = current_user_key_sequence_;
+    current_user_key_sequence_ = ikey_.sequence;
+    SequenceNumber last_snapshot = current_user_key_snapshot_;
+    SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
+    current_user_key_snapshot_ =
+        visible_at_tip_
+            ? earliest_snapshot_
+            : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+    if (need_skip) {
+      // This case is handled below.
+    } else if (clear_and_output_next_key_) {
+      // In the previous iteration we encountered a single delete that we could
+      // not compact out. We will keep this Put, but can drop its data.
+      // (See Optimization 3, below.)
+      assert(ikey_.type == kTypeValue);
+      if (ikey_.type != kTypeValue) {
+        ROCKS_LOG_FATAL(info_log_,
+                        "Unexpected key type %d for compaction output",
+                        ikey_.type);
+      }
+      assert(current_user_key_snapshot_ == last_snapshot);
+      if (current_user_key_snapshot_ != last_snapshot) {
+        ROCKS_LOG_FATAL(info_log_,
+                        "current_user_key_snapshot_ (%" PRIu64
+                        ") != last_snapshot (%" PRIu64 ")",
+                        current_user_key_snapshot_, last_snapshot);
+      }
+
+      value_.clear();
+      valid_ = true;
+      clear_and_output_next_key_ = false;
+    } else if (ikey_.type == kTypeSingleDeletion) {
+      // We can compact out a SingleDelete if:
+      // 1) We encounter the corresponding PUT -OR- we know that this key
+      //    doesn't appear past this output level
+      // =AND=
+      // 2) We've already returned a record in this snapshot -OR-
+      //    there is no earlier earliest_write_conflict_snapshot.
+      //
+      // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+      // allow Transactions to do write-conflict checking (if we compacted away
+      // all keys, then we wouldn't know that a write happened in this
+      // snapshot). If there is no earlier snapshot, then we know that there
+      // are no active transactions that need to know about any writes.
+      //
+      // Optimization 3:
+      // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+      // true, then we must output a SingleDelete. In this case, we will decide
+      // to also output the PUT.
While we are compacting less by outputting the + // PUT now, hopefully this will lead to better compaction in the future + // when Rule 2 is later true (Ie, We are hoping we can later compact out + // both the SingleDelete and the Put, while we couldn't if we only + // outputted the SingleDelete now). + // In this case, we can save space by removing the PUT's value as it will + // never be read. + // + // Deletes and Merges are not supported on the same key that has a + // SingleDelete as it is not possible to correctly do any partial + // compaction of such a combination of operations. The result of mixing + // those operations for a given key is documented as being undefined. So + // we can choose how to handle such a combinations of operations. We will + // try to compact out as much as we can in these cases. + // We will report counts on these anomalous cases. + + // The easiest way to process a SingleDelete during iteration is to peek + // ahead at the next key. + ParsedInternalKey next_ikey; + input_->Next(); + + // Check whether the next key exists, is not corrupt, and is the same key + // as the single delete. + if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + // Check whether the next key belongs to the same snapshot as the + // SingleDelete. + if (prev_snapshot == 0 || + DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) { + if (next_ikey.type == kTypeSingleDeletion) { + // We encountered two SingleDeletes in a row. This could be due to + // unexpected user input. + // Skip the first SingleDelete and let the next iteration decide how + // to handle the second SingleDelete + + // First SingleDelete has been skipped since we already called + // input_->Next(). + ++iter_stats_.num_record_drop_obsolete; + ++iter_stats_.num_single_del_mismatch; + } else if (has_outputted_key_ || + DEFINITELY_IN_SNAPSHOT( + ikey_.sequence, earliest_write_conflict_snapshot_)) { + // Found a matching value, we can drop the single delete and the + // value. It is safe to drop both records since we've already + // outputted a key in this snapshot, or there is no earlier + // snapshot (Rule 2 above). + + // Note: it doesn't matter whether the second key is a Put or if it + // is an unexpected Merge or Delete. We will compact it out + // either way. We will maintain counts of how many mismatches + // happened + if (next_ikey.type != kTypeValue && + next_ikey.type != kTypeBlobIndex) { + ++iter_stats_.num_single_del_mismatch; + } + + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + // Already called input_->Next() once. Call it a second time to + // skip past the second key. + input_->Next(); + } else { + // Found a matching value, but we cannot drop both keys since + // there is an earlier snapshot and we need to leave behind a record + // to know that a write happened in this snapshot (Rule 2 above). + // Clear the value and output the SingleDelete. (The value will be + // outputted on the next iteration.) + + // Setting valid_ to true will output the current SingleDelete + valid_ = true; + + // Set up the Put to be outputted in the next iteration. + // (Optimization 3). + clear_and_output_next_key_ = true; + } + } else { + // We hit the next snapshot without hitting a put, so the iterator + // returns the single delete. + valid_ = true; + } + } else { + // We are at the end of the input, could not parse the next key, or hit + // a different key. 
The iterator returns the single delete if the key
+        // possibly exists beyond the current output level. We set
+        // has_current_user_key to false so that if the iterator is at the next
+        // key, we do not compare it again against the previous key at the next
+        // iteration. If the next key is corrupt, we return before the
+        // comparison, so the value of has_current_user_key does not matter.
+        has_current_user_key_ = false;
+        if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+            compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                       &level_ptrs_)) {
+          // Key doesn't exist outside of this range.
+          // Can compact out this SingleDelete.
+          ++iter_stats_.num_record_drop_obsolete;
+          ++iter_stats_.num_single_del_fallthru;
+          if (!bottommost_level_) {
+            ++iter_stats_.num_optimized_del_drop_obsolete;
+          }
+        } else {
+          // Output SingleDelete
+          valid_ = true;
+        }
+      }
+
+      if (valid_) {
+        at_next_ = true;
+      }
+    } else if (last_snapshot == current_user_key_snapshot_ ||
+               (last_snapshot > 0 &&
+                last_snapshot < current_user_key_snapshot_)) {
+      // If the earliest snapshot in which this key is visible is the same
+      // as the visibility of a previous instance of the same key, then this
+      // kv is not visible in any snapshot: it is hidden by a newer entry for
+      // the same user key.
+      //
+      // Note: Dropping this key will not affect TransactionDB write-conflict
+      // checking since there has already been a record returned for this key
+      // in this snapshot.
+      assert(last_sequence >= current_user_key_sequence_);
+      if (last_sequence < current_user_key_sequence_) {
+        ROCKS_LOG_FATAL(info_log_,
+                        "last_sequence (%" PRIu64
+                        ") < current_user_key_sequence_ (%" PRIu64 ")",
+                        last_sequence, current_user_key_sequence_);
+      }
+
+      ++iter_stats_.num_record_drop_hidden;  // (A)
+      input_->Next();
+    } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
+               IN_EARLIEST_SNAPSHOT(ikey_.sequence) &&
+               ikeyNotNeededForIncrementalSnapshot() &&
+               compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                          &level_ptrs_)) {
+      // TODO(noetzli): This is the only place where we use compaction_
+      // (besides the constructor). We should probably get rid of this
+      // dependency and find a way to do similar filtering during flushes.
+      //
+      // For this user key:
+      // (1) there is no data in higher levels
+      // (2) data in lower levels will have larger sequence numbers
+      // (3) data in layers that are being compacted here and have
+      //     smaller sequence numbers will be dropped in the next
+      //     few iterations of this loop (by rule (A) above).
+      // Therefore this deletion marker is obsolete and can be dropped.
+      //
+      // Note: Dropping this Delete will not affect TransactionDB
+      // write-conflict checking since it is earlier than any snapshot.
+      //
+      // It seems that we can also drop deletions later than the earliest
+      // snapshot given that:
+      // (1) The deletion is earlier than earliest_write_conflict_snapshot, and
+      // (2) No value exists earlier than the deletion.
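+      //
+      // Sketch of the situation handled here (hypothetical seqnos): a
+      // Delete@30 visible in the earliest snapshot, for a key that exists in
+      // no level past the output, only shadows versions like Put@20/Put@10
+      // within this same compaction, which rule (A) will drop anyway; the
+      // tombstone itself is therefore obsolete and is dropped below.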
+ ++iter_stats_.num_record_drop_obsolete; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } + input_->Next(); + } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && + ikeyNotNeededForIncrementalSnapshot()) { + // Handle the case where we have a delete key at the bottom most level + // We can skip outputting the key iff there are no subsequent puts for this + // key + ParsedInternalKey next_ikey; + input_->Next(); + // Skip over all versions of this key that happen to occur in the same snapshot + // range as the delete + while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + (prev_snapshot == 0 || + DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) { + input_->Next(); + } + // If you find you still need to output a row with this key, we need to output the + // delete too + if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + valid_ = true; + at_next_ = true; + } + } else if (ikey_.type == kTypeMerge) { + if (!merge_helper_->HasOperator()) { + status_ = Status::InvalidArgument( + "merge_operator is not properly initialized."); + return; + } + + pinned_iters_mgr_.StartPinning(); + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. + Status s = merge_helper_->MergeUntil(input_, range_del_agg_, + prev_snapshot, bottommost_level_); + merge_out_iter_.SeekToFirst(); + + if (!s.ok() && !s.IsMergeInProgress()) { + status_ = s; + return; + } else if (merge_out_iter_.Valid()) { + // NOTE: key, value, and ikey_ refer to old entries. + // These will be correctly set below. + key_ = merge_out_iter_.key(); + value_ = merge_out_iter_.value(); + bool valid_key __attribute__((__unused__)); + valid_key = ParseInternalKey(key_, &ikey_); + // MergeUntil stops when it encounters a corrupt key and does not + // include them in the result, so we expect the keys here to valid. + assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } + // Keep current_key_ in sync. + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + key_ = current_key_.GetInternalKey(); + ikey_.user_key = current_key_.GetUserKey(); + valid_ = true; + } else { + // all merge operands were filtered out. reset the user key, since the + // batch consumed by the merge operator should not shadow any keys + // coming after the merges + has_current_user_key_ = false; + pinned_iters_mgr_.ReleasePinnedData(); + + if (merge_helper_->FilteredUntil(&skip_until)) { + need_skip = true; + } + } + } else { + // 1. new user key -OR- + // 2. 
different snapshot stripe + bool should_delete = range_del_agg_->ShouldDelete( + key_, RangeDelPositioningMode::kForwardTraversal); + if (should_delete) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_range_del; + input_->Next(); + } else { + valid_ = true; + } + } + + if (need_skip) { + input_->Seek(skip_until); + } + } + + if (!valid_ && IsShuttingDown()) { + status_ = Status::ShutdownInProgress(); + } + + if (IsPausingManualCompaction()) { + status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } +} + +void CompactionIterator::PrepareOutput() { + if (valid_) { + if (compaction_filter_ && ikey_.type == kTypeBlobIndex) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, &compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = Status::Corruption( + "Corrupted blob reference encountered during GC"); + valid_ = false; + } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + valid_ = false; + } else if (blob_decision == + CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + } + } + + // Zeroing out the sequence number leads to better compression. + // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // and the userkey differs from the last userkey in compaction + // then we can squash the seqno to zero. + // + // This is safe for TransactionDB write-conflict checking since transactions + // only care about sequence number larger than any active snapshots. + // + // Can we do the same for levels above bottom level as long as + // KeyNotExistsBeyondOutputLevel() return true? + if (valid_ && compaction_ != nullptr && + !compaction_->allow_ingest_behind() && + ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ && + IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) { + assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected key type %d for seq-zero optimization", + ikey_.type); + } + ikey_.sequence = 0; + current_key_.UpdateInternalKey(0, ikey_.type); + } + } +} + +inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( + SequenceNumber in, SequenceNumber* prev_snapshot) { + assert(snapshots_->size()); + if (snapshots_->size() == 0) { + ROCKS_LOG_FATAL(info_log_, + "No snapshot left in findEarliestVisibleSnapshot"); + } + auto snapshots_iter = std::lower_bound( + snapshots_->begin(), snapshots_->end(), in); + if (snapshots_iter == snapshots_->begin()) { + *prev_snapshot = 0; + } else { + *prev_snapshot = *std::prev(snapshots_iter); + assert(*prev_snapshot < in); + if (*prev_snapshot >= in) { + ROCKS_LOG_FATAL(info_log_, + "*prev_snapshot >= in in findEarliestVisibleSnapshot"); + } + } + if (snapshot_checker_ == nullptr) { + return snapshots_iter != snapshots_->end() + ? *snapshots_iter : kMaxSequenceNumber; + } + bool has_released_snapshot = !released_snapshots_.empty(); + for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { + auto cur = *snapshots_iter; + assert(in <= cur); + if (in > cur) { + ROCKS_LOG_FATAL(info_log_, "in > cur in findEarliestVisibleSnapshot"); + } + // Skip if cur is in released_snapshots. 
+    if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+      continue;
+    }
+    auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+    if (res == SnapshotCheckerResult::kInSnapshot) {
+      return cur;
+    } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+      released_snapshots_.insert(cur);
+    }
+    *prev_snapshot = cur;
+  }
+  return kMaxSequenceNumber;
+}
+
+// Used in two places -- prevents deletion markers from being dropped if they
+// may still be needed, and disables seqnum zero-out in PrepareOutput for
+// recent keys.
+inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() {
+  return (!compaction_->preserve_deletes()) ||
+         (ikey_.sequence < preserve_deletes_seqnum_);
+}
+
+bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) {
+  assert(snapshot_checker_ != nullptr);
+  bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber ||
+                        (earliest_snapshot_iter_ != snapshots_->end() &&
+                         *earliest_snapshot_iter_ == earliest_snapshot_));
+  assert(pre_condition);
+  if (!pre_condition) {
+    ROCKS_LOG_FATAL(info_log_,
+                    "Pre-condition does not hold in IsInEarliestSnapshot");
+  }
+  auto in_snapshot =
+      snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+  while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) {
+    // Avoid the current earliest_snapshot_ being returned as the earliest
+    // visible snapshot for the next value. So if a value's sequence is
+    // zeroed out by PrepareOutput(), the next value will be compacted out.
+    released_snapshots_.insert(earliest_snapshot_);
+    earliest_snapshot_iter_++;
+
+    if (earliest_snapshot_iter_ == snapshots_->end()) {
+      earliest_snapshot_ = kMaxSequenceNumber;
+    } else {
+      earliest_snapshot_ = *earliest_snapshot_iter_;
+    }
+    in_snapshot =
+        snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_);
+  }
+  assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased);
+  if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) {
+    ROCKS_LOG_FATAL(info_log_,
+                    "Unexpected released snapshot in IsInEarliestSnapshot");
+  }
+  return in_snapshot == SnapshotCheckerResult::kInSnapshot;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..8be60eb9e
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,240 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionIterator {
+ public:
+  // A wrapper around Compaction. Has a much smaller interface, only what
+  // CompactionIterator uses. Tests can override it.
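+  // A test stub might look along these lines (illustrative sketch only):
+  //
+  //   class FakeCompaction : public CompactionIterator::CompactionProxy {
+  //    public:
+  //     int level(size_t = 0) const override { return 0; }
+  //     bool KeyNotExistsBeyondOutputLevel(
+  //         const Slice&, std::vector<size_t>*) const override {
+  //       return key_not_exists_beyond_output_level;
+  //     }
+  //     bool bottommost_level() const override { return false; }
+  //     bool key_not_exists_beyond_output_level = false;
+  //   };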
+  class CompactionProxy {
+   public:
+    explicit CompactionProxy(const Compaction* compaction)
+        : compaction_(compaction) {}
+
+    virtual ~CompactionProxy() = default;
+    virtual int level(size_t /*compaction_input_level*/ = 0) const {
+      return compaction_->level();
+    }
+    virtual bool KeyNotExistsBeyondOutputLevel(
+        const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+      return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+    }
+    virtual bool bottommost_level() const {
+      return compaction_->bottommost_level();
+    }
+    virtual int number_levels() const { return compaction_->number_levels(); }
+    virtual Slice GetLargestUserKey() const {
+      return compaction_->GetLargestUserKey();
+    }
+    virtual bool allow_ingest_behind() const {
+      return compaction_->immutable_cf_options()->allow_ingest_behind;
+    }
+    virtual bool preserve_deletes() const {
+      return compaction_->immutable_cf_options()->preserve_deletes;
+    }
+
+   protected:
+    CompactionProxy() = default;
+
+   private:
+    const Compaction* compaction_;
+  };
+
+  CompactionIterator(
+      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      const SnapshotChecker* snapshot_checker, Env* env,
+      bool report_detailed_time, bool expect_valid_internal_key,
+      CompactionRangeDelAggregator* range_del_agg,
+      const Compaction* compaction = nullptr,
+      const CompactionFilter* compaction_filter = nullptr,
+      const std::atomic<bool>* shutting_down = nullptr,
+      const SequenceNumber preserve_deletes_seqnum = 0,
+      const std::atomic<bool>* manual_compaction_paused = nullptr,
+      const std::shared_ptr<Logger> info_log = nullptr);
+
+  // Constructor with custom CompactionProxy, used for tests.
+  CompactionIterator(
+      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      const SnapshotChecker* snapshot_checker, Env* env,
+      bool report_detailed_time, bool expect_valid_internal_key,
+      CompactionRangeDelAggregator* range_del_agg,
+      std::unique_ptr<CompactionProxy> compaction,
+      const CompactionFilter* compaction_filter = nullptr,
+      const std::atomic<bool>* shutting_down = nullptr,
+      const SequenceNumber preserve_deletes_seqnum = 0,
+      const std::atomic<bool>* manual_compaction_paused = nullptr,
+      const std::shared_ptr<Logger> info_log = nullptr);
+
+  ~CompactionIterator();
+
+  void ResetRecordCounts();
+
+  // Seek to the beginning of the compaction iterator output.
+  //
+  // REQUIRED: Call only once.
+  void SeekToFirst();
+
+  // Produces the next record in the compaction.
+  //
+  // REQUIRED: SeekToFirst() has been called.
+  void Next();
+
+  // Getters
+  const Slice& key() const { return key_; }
+  const Slice& value() const { return value_; }
+  const Status& status() const { return status_; }
+  const ParsedInternalKey& ikey() const { return ikey_; }
+  bool Valid() const { return valid_; }
+  const Slice& user_key() const { return current_user_key_; }
+  const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+
+ private:
+  // Processes the input stream to find the next output
+  void NextFromInput();
+
+  // Do last preparations before presenting the output to the callee. At this
+  // point this only zeroes out the sequence number if possible for better
+  // compression.
+  void PrepareOutput();
+
+  // Invoke compaction filter if needed.
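+  // On Decision::kRemoveAndSkipUntil, sets *need_skip and points *skip_until
+  // at an internal key (built with kMaxSequenceNumber) that the caller is
+  // expected to Seek() the input iterator to.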
+  void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+  // Given a sequence number, return the sequence number of the
+  // earliest snapshot that this sequence number is visible in.
+  // The snapshots themselves are arranged in ascending order of
+  // sequence numbers.
+  // Employ a sequential search because the total number of
+  // snapshots is typically small.
+  inline SequenceNumber findEarliestVisibleSnapshot(
+      SequenceNumber in, SequenceNumber* prev_snapshot);
+
+  // Checks whether the currently seen ikey_ is needed for an
+  // incremental (differential) snapshot and hence can't be dropped
+  // or have its seqnum zeroed out even if all other conditions are met.
+  inline bool ikeyNotNeededForIncrementalSnapshot();
+
+  inline bool KeyCommitted(SequenceNumber sequence) {
+    return snapshot_checker_ == nullptr ||
+           snapshot_checker_->CheckInSnapshot(sequence, kMaxSequenceNumber) ==
+               SnapshotCheckerResult::kInSnapshot;
+  }
+
+  bool IsInEarliestSnapshot(SequenceNumber sequence);
+
+  InternalIterator* input_;
+  const Comparator* cmp_;
+  MergeHelper* merge_helper_;
+  const std::vector<SequenceNumber>* snapshots_;
+  // List of snapshots released during compaction.
+  // findEarliestVisibleSnapshot() finds them out via the return value of
+  // snapshot_checker, and makes sure they will not be returned as the
+  // earliest visible snapshot of an older value.
+  // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+  std::unordered_set<SequenceNumber> released_snapshots_;
+  std::vector<SequenceNumber>::const_iterator earliest_snapshot_iter_;
+  const SequenceNumber earliest_write_conflict_snapshot_;
+  const SnapshotChecker* const snapshot_checker_;
+  Env* env_;
+  bool report_detailed_time_;
+  bool expect_valid_internal_key_;
+  CompactionRangeDelAggregator* range_del_agg_;
+  std::unique_ptr<CompactionProxy> compaction_;
+  const CompactionFilter* compaction_filter_;
+  const std::atomic<bool>* shutting_down_;
+  const std::atomic<bool>* manual_compaction_paused_;
+  const SequenceNumber preserve_deletes_seqnum_;
+  bool bottommost_level_;
+  bool valid_ = false;
+  bool visible_at_tip_;
+  SequenceNumber earliest_snapshot_;
+  SequenceNumber latest_snapshot_;
+
+  // State
+  //
+  // Points to a copy of the current compaction iterator output (current_key_)
+  // if valid_.
+  Slice key_;
+  // Points to the value in the underlying iterator that corresponds to the
+  // current output.
+  Slice value_;
+  // The status is OK unless compaction iterator encounters a merge operand
+  // while not having a merge operator defined.
+  Status status_;
+  // Stores the user key, sequence number and type of the current compaction
+  // iterator output (or current key in the underlying iterator during
+  // NextFromInput()).
+  ParsedInternalKey ikey_;
+  // Stores whether ikey_.user_key is valid. If set to false, the user key is
+  // not compared against the current key in the underlying iterator.
+  bool has_current_user_key_ = false;
+  // If false, the underlying iterator is still positioned at the current
+  // record; if true, it has already advanced to the entry after it.
+  bool at_next_ = false;
+  // Holds a copy of the current compaction iterator output (or current key in
+  // the underlying iterator during NextFromInput()).
+  IterKey current_key_;
+  Slice current_user_key_;
+  SequenceNumber current_user_key_sequence_;
+  SequenceNumber current_user_key_snapshot_;
+
+  // True if the iterator has already returned a record for the current key.
+  bool has_outputted_key_ = false;
+
+  // If true, clear the value of the next key and output it without applying
+  // any compaction rules. This is used for outputting a put after a single
+  // delete.
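+  // For example, when a single delete is kept only for write-conflict
+  // checking, the put below it is emitted with its value cleared (see
+  // KeepSingleDeletionForWriteConflictChecking in the tests).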
+  bool clear_and_output_next_key_ = false;
+
+  MergeOutputIterator merge_out_iter_;
+  // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+  // merge operands and then releasing them after consuming them.
+  PinnedIteratorsManager pinned_iters_mgr_;
+  std::string compaction_filter_value_;
+  InternalKey compaction_filter_skip_until_;
+  // "level_ptrs" holds indices that remember which file of an associated
+  // level we were last checking during the last call to compaction->
+  // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+  // to pick up where the last call left off, since each subcompaction's key
+  // range is increasing, so a later call must be looking for a key that is in
+  // or beyond the last file checked during the previous call.
+  std::vector<size_t> level_ptrs_;
+  CompactionIterationStats iter_stats_;
+
+  // Used to avoid purging uncommitted values. The application can specify
+  // uncommitted values by providing a SnapshotChecker object.
+  bool current_key_committed_;
+  std::shared_ptr<Logger> info_log_;
+
+  bool IsShuttingDown() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+  }
+
+  bool IsPausingManualCompaction() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return manual_compaction_paused_ &&
+           manual_compaction_paused_->load(std::memory_order_relaxed);
+  }
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..0c50fb9ba
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,976 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iterator.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+  bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                   MergeOperationOutput* /*merge_out*/) const override {
+    ADD_FAILURE();
+    return false;
+  }
+  bool PartialMergeMulti(const Slice& /*key*/,
+                         const std::deque<Slice>& /*operand_list*/,
+                         std::string* /*new_value*/,
+                         Logger* /*logger*/) const override {
+    ADD_FAILURE();
+    return false;
+  }
+  const char* Name() const override {
+    return "CompactionIteratorTest NoMergingMergeOp";
+  }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
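+// The stall is a spin-wait on an atomic threshold, which lets a test
+// deterministically pause a compaction inside the filter callback and observe
+// it from another thread.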
+class StallingFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int k = std::atoi(key.ToString().c_str()); + last_seen.store(k); + while (k >= stall_at.load()) { + std::this_thread::yield(); + } + return Decision::kRemove; + } + + const char* Name() const override { + return "CompactionIteratorTest StallingFilter"; + } + + // Wait until the filter sees a key >= k and stalls at that key. + // If `exact`, asserts that the seen key is equal to k. + void WaitForStall(int k, bool exact = true) { + stall_at.store(k); + while (last_seen.load() < k) { + std::this_thread::yield(); + } + if (exact) { + EXPECT_EQ(k, last_seen.load()); + } + } + + // Filter will stall on key >= stall_at. Advance stall_at to unstall. + mutable std::atomic stall_at{0}; + // Last key the filter was called with. + mutable std::atomic last_seen{0}; +}; + +// Compaction filter that filter out all keys. +class FilterAllKeysCompactionFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return Decision::kRemove; + } + + const char* Name() const override { return "AllKeysCompactionFilter"; } +}; + +class LoggingForwardVectorIterator : public InternalIterator { + public: + struct Action { + enum class Type { + SEEK_TO_FIRST, + SEEK, + NEXT, + }; + + Type type; + std::string arg; + + explicit Action(Type _type, std::string _arg = "") + : type(_type), arg(_arg) {} + + bool operator==(const Action& rhs) const { + return std::tie(type, arg) == std::tie(rhs.type, rhs.arg); + } + }; + + LoggingForwardVectorIterator(const std::vector& keys, + const std::vector& values) + : keys_(keys), values_(values), current_(keys.size()) { + assert(keys_.size() == values_.size()); + } + + bool Valid() const override { return current_ < keys_.size(); } + + void SeekToFirst() override { + log.emplace_back(Action::Type::SEEK_TO_FIRST); + current_ = 0; + } + void SeekToLast() override { assert(false); } + + void Seek(const Slice& target) override { + log.emplace_back(Action::Type::SEEK, target.ToString()); + current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + + void SeekForPrev(const Slice& /*target*/) override { assert(false); } + + void Next() override { + assert(Valid()); + log.emplace_back(Action::Type::NEXT); + current_++; + } + void Prev() override { assert(false); } + + Slice key() const override { + assert(Valid()); + return Slice(keys_[current_]); + } + Slice value() const override { + assert(Valid()); + return Slice(values_[current_]); + } + + Status status() const override { return Status::OK(); } + + std::vector log; + + private: + std::vector keys_; + std::vector values_; + size_t current_; +}; + +class FakeCompaction : public CompactionIterator::CompactionProxy { + public: + FakeCompaction() = default; + + int level(size_t /*compaction_input_level*/) const override { return 0; } + bool KeyNotExistsBeyondOutputLevel( + const Slice& /*user_key*/, + std::vector* /*level_ptrs*/) const override { + return is_bottommost_level || key_not_exists_beyond_output_level; + } + bool bottommost_level() const override { return is_bottommost_level; } + int number_levels() const override { return 1; } + Slice GetLargestUserKey() const override { + 
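+    // Nine 0xff bytes compare greater than any user key used in these tests,
+    // so the fake compaction effectively spans the whole key space.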
return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + } + bool allow_ingest_behind() const override { return false; } + + bool preserve_deletes() const override { return false; } + + bool key_not_exists_beyond_output_level = false; + + bool is_bottommost_level = false; +}; + +// A simplifed snapshot checker which assumes each snapshot has a global +// last visible sequence. +class TestSnapshotChecker : public SnapshotChecker { + public: + explicit TestSnapshotChecker( + SequenceNumber last_committed_sequence, + const std::unordered_map& snapshots = {{}}) + : last_committed_sequence_(last_committed_sequence), + snapshots_(snapshots) {} + + SnapshotCheckerResult CheckInSnapshot( + SequenceNumber seq, SequenceNumber snapshot_seq) const override { + if (snapshot_seq == kMaxSequenceNumber) { + return seq <= last_committed_sequence_ + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + assert(snapshots_.count(snapshot_seq) > 0); + return seq <= snapshots_.at(snapshot_seq) + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + + private: + SequenceNumber last_committed_sequence_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map snapshots_; +}; + +// Test param: +// bool: whether to pass snapshot_checker to compaction iterator. +class CompactionIteratorTest : public testing::TestWithParam { + public: + CompactionIteratorTest() + : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + + void InitIterators( + const std::vector& ks, const std::vector& vs, + const std::vector& range_del_ks, + const std::vector& range_del_vs, + SequenceNumber last_sequence, + SequenceNumber last_committed_sequence = kMaxSequenceNumber, + MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + std::unique_ptr unfragmented_range_del_iter( + new test::VectorIterator(range_del_ks, range_del_vs)); + auto tombstone_list = std::make_shared( + std::move(unfragmented_range_del_iter), icmp_); + std::unique_ptr range_del_iter( + new FragmentedRangeTombstoneIterator(tombstone_list, icmp_, + kMaxSequenceNumber)); + range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_)); + range_del_agg_->AddTombstones(std::move(range_del_iter)); + + std::unique_ptr compaction; + if (filter || bottommost_level) { + compaction_proxy_ = new FakeCompaction(); + compaction_proxy_->is_bottommost_level = bottommost_level; + compaction.reset(compaction_proxy_); + } + bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); + if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) { + snapshot_checker_.reset( + new TestSnapshotChecker(last_committed_sequence, snapshot_map_)); + } + merge_helper_.reset( + new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false, + 0 /*latest_snapshot*/, snapshot_checker_.get(), + 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + + iter_.reset(new LoggingForwardVectorIterator(ks, vs)); + iter_->SeekToFirst(); + c_iter_.reset(new CompactionIterator( + iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, + earliest_write_conflict_snapshot, snapshot_checker_.get(), + Env::Default(), false /* report_detailed_time */, false, + range_del_agg_.get(), std::move(compaction), filter, &shutting_down_)); + } + + void AddSnapshot(SequenceNumber snapshot, + SequenceNumber last_visible_seq = kMaxSequenceNumber) { + 
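+    // Record the snapshot both in the iterator's snapshot list and in the
+    // TestSnapshotChecker's last-visible-sequence map.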
snapshots_.push_back(snapshot); + snapshot_map_[snapshot] = last_visible_seq; + } + + virtual bool UseSnapshotChecker() const { return false; } + + void RunTest( + const std::vector& input_keys, + const std::vector& input_values, + const std::vector& expected_keys, + const std::vector& expected_values, + SequenceNumber last_committed_seq = kMaxSequenceNumber, + MergeOperator* merge_operator = nullptr, + CompactionFilter* compaction_filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, + last_committed_seq, merge_operator, compaction_filter, + bottommost_level, earliest_write_conflict_snapshot); + c_iter_->SeekToFirst(); + for (size_t i = 0; i < expected_keys.size(); i++) { + std::string info = "i = " + ToString(i); + ASSERT_TRUE(c_iter_->Valid()) << info; + ASSERT_OK(c_iter_->status()) << info; + ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info; + ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info; + c_iter_->Next(); + } + ASSERT_FALSE(c_iter_->Valid()); + } + + const Comparator* cmp_; + const InternalKeyComparator icmp_; + std::vector snapshots_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map snapshot_map_; + std::unique_ptr merge_helper_; + std::unique_ptr iter_; + std::unique_ptr c_iter_; + std::unique_ptr range_del_agg_; + std::unique_ptr snapshot_checker_; + std::atomic shutting_down_{false}; + FakeCompaction* compaction_proxy_; +}; + +// It is possible that the output of the compaction iterator is empty even if +// the input is not. +TEST_P(CompactionIteratorTest, EmptyResult) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue)}, + {"", "val"}, {}, {}, 5); + c_iter_->SeekToFirst(); + ASSERT_FALSE(c_iter_->Valid()); +} + +// If there is a corruption after a single deletion, the corrupted key should +// be preserved. 
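+// (A corrupt internal key cannot be parsed, so neither the single deletion
+// above it nor the key itself may be optimized away.)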
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { + InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), + test::KeyStr("a", 3, kTypeValue, true), + test::KeyStr("b", 10, kTypeValue)}, + {"", "val", "val2"}, {}, {}, 10); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion), + c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, SimpleRangeDeletion) { + InitIterators({test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("morning", 2, kTypeValue), + test::KeyStr("night", 3, kTypeValue)}, + {"zao", "zao", "wan"}, + {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) { + AddSnapshot(10); + std::vector ks1; + ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion)); + std::vector vs1{"mz"}; + std::vector ks2{test::KeyStr("morning", 15, kTypeValue), + test::KeyStr("morning", 5, kTypeValue), + test::KeyStr("night", 40, kTypeValue), + test::KeyStr("night", 20, kTypeValue)}; + std::vector vs2{"zao 15", "zao 5", "wan 40", "wan 20"}; + InitIterators(ks2, vs2, ks1, vs1, 40); + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); +} + +TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* skip_until) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("av50", v); + return Decision::kKeep; + } + if (k == "b") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("bv60", v); + *skip_until = "d+"; + return Decision::kRemoveAndSkipUntil; + } + if (k == "e") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("em71", v); + return Decision::kKeep; + } + if (k == "f") { + if (v == "fm65") { + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "f"; + } else { + EXPECT_EQ("fm30", v); + EXPECT_EQ(ValueType::kMergeOperand, t); + *skip_until = "g+"; + } + return Decision::kRemoveAndSkipUntil; + } + if (k == "h") { + EXPECT_EQ(ValueType::kValue, t); + EXPECT_EQ("hv91", v); + return Decision::kKeep; + } + if (k == "i") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("im95", v); + *skip_until = "z"; + return Decision::kRemoveAndSkipUntil; + } + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter"; + } + }; + + NoMergingMergeOp merge_op; + Filter filter; + InitIterators( + {test::KeyStr("a", 50, kTypeValue), // keep + test::KeyStr("a", 45, kTypeMerge), + test::KeyStr("b", 60, kTypeValue), // skip to "d+" + test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue), + test::KeyStr("d", 70, kTypeMerge), + test::KeyStr("e", 71, kTypeMerge), // keep + test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep + test::KeyStr("f", 30, kTypeMerge), // skip to "g+" + test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue), + test::KeyStr("h", 91, kTypeValue), // keep + test::KeyStr("i", 95, kTypeMerge), // skip to "z" + test::KeyStr("j", 99, kTypeValue)}, + {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30", + "fv25", "gv90", "hv91", "im95", "jv99"}, + {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter); + + // Compaction should output just "a", "e" and "h" keys. + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("av50", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("em71", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString()); + ASSERT_EQ("hv91", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_FALSE(c_iter_->Valid()); + + // Check that the compaction iterator did the correct sequence of calls on + // the underlying iterator. 
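+  // Each kRemoveAndSkipUntil decision shows up as a Seek() on the input; the
+  // target is an internal key built with kMaxSequenceNumber, which lands on
+  // the first entry at or after the requested user key.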
+ using A = LoggingForwardVectorIterator::Action; + using T = A::Type; + std::vector expected_actions = { + A(T::SEEK_TO_FIRST), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::NEXT), + A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)), + A(T::NEXT), + A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))}; + ASSERT_EQ(expected_actions, iter_->log); +} + +TEST_P(CompactionIteratorTest, ShuttingDownInFilter) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue), + test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + // Don't leave tombstones (kTypeDeletion) for filtered keys. + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + EXPECT_FALSE(c_iter_->Valid()); + EXPECT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +// Same as ShuttingDownInFilter, but shutdown happens during filter call for +// a merge operand, not for a value. +TEST_P(CompactionIteratorTest, ShuttingDownInMerge) { + NoMergingMergeOp merge_op; + StallingFilter filter; + InitIterators( + {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge), + test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)}, + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); + compaction_proxy_->key_not_exists_beyond_output_level = true; + + std::atomic seek_done{false}; + ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] { + c_iter_->SeekToFirst(); + ASSERT_FALSE(c_iter_->Valid()); + ASSERT_TRUE(c_iter_->status().IsShutdownInProgress()); + seek_done.store(true); + }); + + // Let key 1 through. + filter.WaitForStall(1); + + // Shutdown during compaction filter call for key 2. + filter.WaitForStall(2); + shutting_down_.store(true); + EXPECT_FALSE(seek_done.load()); + + // Unstall filter and wait for SeekToFirst() to return. + filter.stall_at.store(3); + compaction_thread.join(); + assert(seek_done.load()); + + // Check that filter was never called again. + EXPECT_EQ(2, filter.last_seen.load()); +} + +TEST_P(CompactionIteratorTest, SingleMergeOperand) { + class Filter : public CompactionFilter { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. 
+ if (k == "a") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("av1", v); + return Decision::kKeep; + } else if (k == "b") { + EXPECT_EQ(ValueType::kMergeOperand, t); + return Decision::kKeep; + } else if (k == "c") { + return Decision::kKeep; + } + + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.SingleMergeOperand::Filter"; + } + }; + + class SingleMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + // See InitIterators() call below for why "c" is the only key for which + // FullMergeV2 should be called. + EXPECT_EQ("c", merge_in.key.ToString()); + + std::string temp_value; + if (merge_in.existing_value != nullptr) { + temp_value = merge_in.existing_value->ToString(); + } + + for (auto& operand : merge_in.operand_list) { + temp_value.append(operand.ToString()); + } + merge_out->new_value = temp_value; + + return true; + } + + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { + std::string string_key = key.ToString(); + EXPECT_TRUE(string_key == "a" || string_key == "b"); + + if (string_key == "a") { + EXPECT_EQ(1, operand_list.size()); + } else if (string_key == "b") { + EXPECT_EQ(2, operand_list.size()); + } + + std::string temp_value; + for (auto& operand : operand_list) { + temp_value.append(operand.ToString()); + } + swap(temp_value, *new_value); + + return true; + } + + const char* Name() const override { + return "CompactionIteratorTest SingleMergeOp"; + } + + bool AllowSingleOperand() const override { return true; } + }; + + SingleMergeOp merge_op; + Filter filter; + InitIterators( + // a should invoke PartialMergeMulti with a single merge operand. + {test::KeyStr("a", 50, kTypeMerge), + // b should invoke PartialMergeMulti with two operands. + test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge), + // c should invoke FullMerge due to kTypeValue at the beginning. + test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)}, + {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber, + kMaxSequenceNumber, &merge_op, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("av1", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ("bv1bv2", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); +} + +// In bottommost level, values earlier than earliest snapshot can be output +// with sequence = 0. +TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// In bottommost level, deletions earlier than earliest snapshot can be removed +// permanently. 
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeDeletion), + test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, + kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +// In bottommost level, single deletions earlier than earliest snapshot can be +// removed permanently. +TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion)}, + {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, + kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest, + testing::Values(true, false)); + +// Tests how CompactionIterator work together with SnapshotChecker. +class CompactionIteratorWithSnapshotCheckerTest + : public CompactionIteratorTest { + public: + bool UseSnapshotChecker() const override { return true; } +}; + +// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is +// while committed version of these keys should get compacted as usual. + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Value) { + RunTest( + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Deletion) { + RunTest({test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Merge) { + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_SingleDelete) { + RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_BlobIndex) { + RunTest({test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +// Test compaction iterator dedup keys visible to the same snapshot. 
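+//
+// A worked example of the stripe rule, assuming AddSnapshot(2, 1): the
+// snapshot at seq 2 last sees seq 1, so committed versions at seq 1 and at
+// seq 3 fall into different stripes and must both be kept, while two
+// committed versions inside the same stripe collapse to the newest one.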
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) { + AddSnapshot(2, 1); + AddSnapshot(4, 3); + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + DedupSameSnapshot_SingleDeletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeSingleDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +// At bottom level, sequence numbers can be zero out, and deletions can be +// removed, but only when they are visible to earliest snapshot. 
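+//
+// With AddSnapshot(2, 1), the checker reports seq 1 as visible to the
+// earliest snapshot but seq 2 and 3 as not visible, which is why only the
+// seq-1 entries below may be zeroed out or dropped.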
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+           test::KeyStr("c", 3, kTypeValue)},
+          {"v1", "v2", "v3"},
+          {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+           test::KeyStr("c", 3, kTypeValue)},
+          {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/,
+          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest(
+      {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+       test::KeyStr("c", 3, kTypeDeletion)},
+      {"", "", ""},
+      {test::KeyStr("b", 2, kTypeDeletion),
+       test::KeyStr("c", 3, kTypeDeletion)},
+      {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+      true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest(
+      {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue),
+       test::KeyStr("b", 3, kTypeValue)},
+      {"", "", ""},
+      {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue),
+       test::KeyStr("b", 3, kTypeValue)},
+      {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+      true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+           test::KeyStr("b", 2, kTypeSingleDeletion),
+           test::KeyStr("c", 3, kTypeSingleDeletion)},
+          {"", "", ""},
+          {test::KeyStr("b", 2, kTypeSingleDeletion),
+           test::KeyStr("c", 3, kTypeSingleDeletion)},
+          {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+// Single delete should not cancel out values that are not visible to the
+// same set of snapshots.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       SingleDeleteAcrossSnapshotBoundary) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", "v1"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// Single delete should be kept in case it is not visible to the
+// earliest write conflict snapshot. If a single delete is kept for this
+// reason, the corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       KeepSingleDeletionForWriteConflictChecking) {
+  AddSnapshot(2, 0);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", "v1"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+          nullptr /*compaction_filter*/, false /*bottommost_level*/,
+          2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Compaction filter should keep uncommitted keys as-is, and
+// * convert the latest value to a deletion, and/or
+// * if the latest value is a merge, apply the filter to all subsequent
+//   merge operands.
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)}, + {"v2", "v1", "v3", "v4"}, + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)}, + {"v2", "", "v3", ""}, 1 /*last_committed_seq*/, + nullptr /*merge_operator*/, compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) { + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeDeletion), + test::KeyStr("a", 1, kTypeDeletion)}, + {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/, + compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + CompactionFilter_PartialMerge) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeMerge)}, + {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"}, + 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)}, + {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(), + compaction_filter.get()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc new file mode 100644 index 000000000..576ec7b45 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job.cc @@ -0,0 +1,1700 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include <algorithm>
+#include <cinttypes>
+#include <functional>
+#include <list>
+#include <memory>
+#include <random>
+#include <set>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+  switch (compaction_reason) {
+    case CompactionReason::kUnknown:
+      return "Unknown";
+    case CompactionReason::kLevelL0FilesNum:
+      return "LevelL0FilesNum";
+    case CompactionReason::kLevelMaxLevelSize:
+      return "LevelMaxLevelSize";
+    case CompactionReason::kUniversalSizeAmplification:
+      return "UniversalSizeAmplification";
+    case CompactionReason::kUniversalSizeRatio:
+      return "UniversalSizeRatio";
+    case CompactionReason::kUniversalSortedRunNum:
+      return "UniversalSortedRunNum";
+    case CompactionReason::kFIFOMaxSize:
+      return "FIFOMaxSize";
+    case CompactionReason::kFIFOReduceNumFiles:
+      return "FIFOReduceNumFiles";
+    case CompactionReason::kFIFOTtl:
+      return "FIFOTtl";
+    case CompactionReason::kManualCompaction:
+      return "ManualCompaction";
+    case CompactionReason::kFilesMarkedForCompaction:
+      return "FilesMarkedForCompaction";
+    case CompactionReason::kBottommostFiles:
+      return "BottommostFiles";
+    case CompactionReason::kTtl:
+      return "Ttl";
+    case CompactionReason::kFlush:
+      return "Flush";
+    case CompactionReason::kExternalSstIngestion:
+      return "ExternalSstIngestion";
+    case CompactionReason::kPeriodicCompaction:
+      return "PeriodicCompaction";
+    case CompactionReason::kNumOfReasons:
+      // fall through
+    default:
+      assert(false);
+      return "Invalid";
+  }
+}
+
+// Maintains state for each sub-compaction
+struct CompactionJob::SubcompactionState {
+  const Compaction* compaction;
+  std::unique_ptr<CompactionIterator> c_iter;
+
+  // The boundaries of the key-range this compaction is interested in. No two
+  // subcompactions may have overlapping key-ranges.
+ // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded + Slice *start, *end; + + // The return status of this subcompaction + Status status; + + // Files produced by this subcompaction + struct Output { + FileMetaData meta; + bool finished; + std::shared_ptr table_properties; + }; + + // State kept for output being generated + std::vector outputs; + std::unique_ptr outfile; + std::unique_ptr builder; + Output* current_output() { + if (outputs.empty()) { + // This subcompaction's outptut could be empty if compaction was aborted + // before this subcompaction had a chance to generate any output files. + // When subcompactions are executed sequentially this is more likely and + // will be particulalry likely for the later subcompactions to be empty. + // Once they are run in parallel however it should be much rarer. + return nullptr; + } else { + return &outputs.back(); + } + } + + uint64_t current_output_file_size; + + // State during the subcompaction + uint64_t total_bytes; + uint64_t num_output_records; + CompactionJobStats compaction_job_stats; + uint64_t approx_size; + // An index that used to speed up ShouldStopBefore(). + size_t grandparent_index = 0; + // The number of bytes overlapping between the current output and + // grandparent files used in ShouldStopBefore(). + uint64_t overlapped_bytes = 0; + // A flag determine whether the key has been seen in ShouldStopBefore() + bool seen_key = false; + + SubcompactionState(Compaction* c, Slice* _start, Slice* _end, + uint64_t size = 0) + : compaction(c), + start(_start), + end(_end), + outfile(nullptr), + builder(nullptr), + current_output_file_size(0), + total_bytes(0), + num_output_records(0), + approx_size(size), + grandparent_index(0), + overlapped_bytes(0), + seen_key(false) { + assert(compaction != nullptr); + } + + SubcompactionState(SubcompactionState&& o) { *this = std::move(o); } + + SubcompactionState& operator=(SubcompactionState&& o) { + compaction = std::move(o.compaction); + start = std::move(o.start); + end = std::move(o.end); + status = std::move(o.status); + outputs = std::move(o.outputs); + outfile = std::move(o.outfile); + builder = std::move(o.builder); + current_output_file_size = std::move(o.current_output_file_size); + total_bytes = std::move(o.total_bytes); + num_output_records = std::move(o.num_output_records); + compaction_job_stats = std::move(o.compaction_job_stats); + approx_size = std::move(o.approx_size); + grandparent_index = std::move(o.grandparent_index); + overlapped_bytes = std::move(o.overlapped_bytes); + seen_key = std::move(o.seen_key); + return *this; + } + + // Because member std::unique_ptrs do not have these. + SubcompactionState(const SubcompactionState&) = delete; + + SubcompactionState& operator=(const SubcompactionState&) = delete; + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) { + const InternalKeyComparator* icmp = + &compaction->column_family_data()->internal_comparator(); + const std::vector& grandparents = compaction->grandparents(); + + // Scan to find earliest grandparent file that contains key. 
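+    // Every grandparent file passed over while scanning adds its size to
+    // overlapped_bytes; once overlapped_bytes + curr_file_size exceeds
+    // max_compaction_bytes(), the current output file is cut so that a future
+    // compaction of it does not overlap too much of the grandparent level.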
+ while (grandparent_index < grandparents.size() && + icmp->Compare(internal_key, + grandparents[grandparent_index]->largest.Encode()) > + 0) { + if (seen_key) { + overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize(); + } + assert(grandparent_index + 1 >= grandparents.size() || + icmp->Compare( + grandparents[grandparent_index]->largest.Encode(), + grandparents[grandparent_index + 1]->smallest.Encode()) <= 0); + grandparent_index++; + } + seen_key = true; + + if (overlapped_bytes + curr_file_size > + compaction->max_compaction_bytes()) { + // Too much overlap for current output; start new output + overlapped_bytes = 0; + return true; + } + + return false; + } +}; + +// Maintains state for the entire compaction +struct CompactionJob::CompactionState { + Compaction* const compaction; + + // REQUIRED: subcompaction states are stored in order of increasing + // key-range + std::vector sub_compact_states; + Status status; + + uint64_t total_bytes; + uint64_t num_output_records; + + explicit CompactionState(Compaction* c) + : compaction(c), + total_bytes(0), + num_output_records(0) {} + + size_t NumOutputFiles() { + size_t total = 0; + for (auto& s : sub_compact_states) { + total += s.outputs.size(); + } + return total; + } + + Slice SmallestUserKey() { + for (const auto& sub_compact_state : sub_compact_states) { + if (!sub_compact_state.outputs.empty() && + sub_compact_state.outputs[0].finished) { + return sub_compact_state.outputs[0].meta.smallest.user_key(); + } + } + // If there is no finished output, return an empty slice. + return Slice(nullptr, 0); + } + + Slice LargestUserKey() { + for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend(); + ++it) { + if (!it->outputs.empty() && it->current_output()->finished) { + assert(it->current_output() != nullptr); + return it->current_output()->meta.largest.user_key(); + } + } + // If there is no finished output, return an empty slice. 
+ return Slice(nullptr, 0); + } +}; + +void CompactionJob::AggregateStatistics() { + for (SubcompactionState& sc : compact_->sub_compact_states) { + compact_->total_bytes += sc.total_bytes; + compact_->num_output_records += sc.num_output_records; + } + if (compaction_job_stats_) { + for (SubcompactionState& sc : compact_->sub_compact_states) { + compaction_job_stats_->Add(sc.compaction_job_stats); + } + } +} + +CompactionJob::CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, + Directory* db_directory, Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, + EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, const std::atomic* manual_compaction_paused) + : job_id_(job_id), + compact_(new CompactionState(compaction)), + compaction_job_stats_(compaction_job_stats), + compaction_stats_(compaction->compaction_reason(), 1), + dbname_(dbname), + db_options_(db_options), + file_options_(file_options), + env_(db_options.env), + fs_(db_options.fs.get()), + file_options_for_read_( + fs_->OptimizeForCompactionTableRead(file_options, db_options_)), + versions_(versions), + shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), + preserve_deletes_seqnum_(preserve_deletes_seqnum), + log_buffer_(log_buffer), + db_directory_(db_directory), + output_directory_(output_directory), + stats_(stats), + db_mutex_(db_mutex), + db_error_handler_(db_error_handler), + existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + table_cache_(std::move(table_cache)), + event_logger_(event_logger), + bottommost_level_(false), + paranoid_file_checks_(paranoid_file_checks), + measure_io_stats_(measure_io_stats), + write_hint_(Env::WLTH_NOT_SET), + thread_pri_(thread_pri) { + assert(log_buffer_ != nullptr); + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + db_options_.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + ReportStartedCompaction(compaction); +} + +CompactionJob::~CompactionJob() { + assert(compact_ == nullptr); + ThreadStatusUtil::ResetThreadStatus(); +} + +void CompactionJob::ReportStartedCompaction(Compaction* compaction) { + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + db_options_.enable_thread_tracking); + + ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, + job_id_); + + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, + (static_cast(compact_->compaction->start_level()) << 32) + + compact_->compaction->output_level()); + + // In the current design, a CompactionJob is always created + // for non-trivial compaction. 
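+  // (The assertion below allows a trivial move only when it was requested
+  // manually.)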
+ assert(compaction->IsTrivialMove() == false || + compaction->is_manual_compaction() == true); + + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_PROP_FLAGS, + compaction->is_manual_compaction() + + (compaction->deletion_compaction() << 1)); + + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, + compaction->CalculateTotalInputSize()); + + IOSTATS_RESET(bytes_written); + IOSTATS_RESET(bytes_read); + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_WRITTEN, 0); + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_READ, 0); + + // Set the thread operation after operation properties + // to ensure GetThreadList() can always show them all together. + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + + if (compaction_job_stats_) { + compaction_job_stats_->is_manual_compaction = + compaction->is_manual_compaction(); + } +} + +void CompactionJob::Prepare() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PREPARE); + + // Generate file_levels_ for compaction berfore making Iterator + auto* c = compact_->compaction; + assert(c->column_family_data() != nullptr); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + if (c->ShouldFormSubcompactions()) { + { + StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME); + GenSubcompactionBoundaries(); + } + assert(sizes_.size() == boundaries_.size() + 1); + + for (size_t i = 0; i <= boundaries_.size(); i++) { + Slice* start = i == 0 ? nullptr : &boundaries_[i - 1]; + Slice* end = i == boundaries_.size() ? 
nullptr : &boundaries_[i]; + compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]); + } + RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, + compact_->sub_compact_states.size()); + } else { + compact_->sub_compact_states.emplace_back(c, nullptr, nullptr); + } +} + +struct RangeWithSize { + Range range; + uint64_t size; + + RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0) + : range(a, b), size(s) {} +}; + +void CompactionJob::GenSubcompactionBoundaries() { + auto* c = compact_->compaction; + auto* cfd = c->column_family_data(); + const Comparator* cfd_comparator = cfd->user_comparator(); + std::vector bounds; + int start_lvl = c->start_level(); + int out_lvl = c->output_level(); + + // Add the starting and/or ending key of certain input files as a potential + // boundary + for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) { + int lvl = c->level(lvl_idx); + if (lvl >= start_lvl && lvl <= out_lvl) { + const LevelFilesBrief* flevel = c->input_levels(lvl_idx); + size_t num_files = flevel->num_files; + + if (num_files == 0) { + continue; + } + + if (lvl == 0) { + // For level 0 add the starting and ending key of each file since the + // files may have greatly differing key ranges (not range-partitioned) + for (size_t i = 0; i < num_files; i++) { + bounds.emplace_back(flevel->files[i].smallest_key); + bounds.emplace_back(flevel->files[i].largest_key); + } + } else { + // For all other levels add the smallest/largest key in the level to + // encompass the range covered by that level + bounds.emplace_back(flevel->files[0].smallest_key); + bounds.emplace_back(flevel->files[num_files - 1].largest_key); + if (lvl == out_lvl) { + // For the last level include the starting keys of all files since + // the last level is the largest and probably has the widest key + // range. Since it's range partitioned, the ending key of one file + // and the starting key of the next are very close (or identical). + for (size_t i = 1; i < num_files; i++) { + bounds.emplace_back(flevel->files[i].smallest_key); + } + } + } + } + } + + std::sort(bounds.begin(), bounds.end(), + [cfd_comparator](const Slice& a, const Slice& b) -> bool { + return cfd_comparator->Compare(ExtractUserKey(a), + ExtractUserKey(b)) < 0; + }); + // Remove duplicated entries from bounds + bounds.erase( + std::unique(bounds.begin(), bounds.end(), + [cfd_comparator](const Slice& a, const Slice& b) -> bool { + return cfd_comparator->Compare(ExtractUserKey(a), + ExtractUserKey(b)) == 0; + }), + bounds.end()); + + // Combine consecutive pairs of boundaries into ranges with an approximate + // size of data covered by keys in that range + uint64_t sum = 0; + std::vector ranges; + // Get input version from CompactionState since it's already referenced + // earlier in SetInputVersioCompaction::SetInputVersion and will not change + // when db_mutex_ is released below + auto* v = compact_->compaction->input_version(); + for (auto it = bounds.begin();;) { + const Slice a = *it; + ++it; + + if (it == bounds.end()) { + break; + } + + const Slice b = *it; + + // ApproximateSize could potentially create table reader iterator to seek + // to the index block and may incur I/O cost in the process. 
+Status CompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + TEST_SYNC_POINT("CompactionJob::Run():Start"); + log_buffer_->FlushBufferToLog(); + LogCompaction(); + + const size_t num_threads = compact_->sub_compact_states.size(); + assert(num_threads > 0); + const uint64_t start_micros = env_->NowMicros(); + + // Launch a thread for each of subcompactions 1...num_threads-1 + std::vector<port::Thread> thread_pool; + thread_pool.reserve(num_threads - 1); + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this, + &compact_->sub_compact_states[i]); + } + + // Always schedule the first subcompaction (whether or not there are also + // others) in the current thread to be efficient with resources + ProcessKeyValueCompaction(&compact_->sub_compact_states[0]); + + // Wait for all other threads (if there are any) to finish execution + for (auto& thread : thread_pool) { + thread.join(); + } + + compaction_stats_.micros = env_->NowMicros() - start_micros; + compaction_stats_.cpu_micros = 0; + for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { + compaction_stats_.cpu_micros += + compact_->sub_compact_states[i].compaction_job_stats.cpu_micros; + } + + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.cpu_micros); + + TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); + + // Check if any thread encountered an error during execution + Status status; + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; + } + } + + if (status.ok() && output_directory_) { + status = output_directory_->Fsync(); + } + + if (status.ok()) { + thread_pool.clear(); + std::vector<const FileMetaData*> files_meta; + for (const auto& state : compact_->sub_compact_states)
{ + for (const auto& output : state.outputs) { + files_meta.emplace_back(&output.meta); + } + } + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + auto prefix_extractor = + compact_->compaction->mutable_cf_options()->prefix_extractor.get(); + std::atomic<size_t> next_file_meta_idx(0); + auto verify_table = [&](Status& output_status) { + while (true) { + size_t file_idx = next_file_meta_idx.fetch_add(1); + if (file_idx >= files_meta.size()) { + break; + } + // Verify that the table is usable + // We set for_compaction to false and don't OptimizeForCompactionTableRead + // here because this is a special case after we finish the table building + // No matter whether use_direct_io_for_flush_and_compaction is true, + // we will regard this verification as user reads since the goal is + // to cache it here for further user reads + InternalIterator* iter = cfd->table_cache()->NewIterator( + ReadOptions(), file_options_, cfd->internal_comparator(), + *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + /*table_reader_ptr=*/nullptr, + cfd->internal_stats()->GetFileReadHist( + compact_->compaction->output_level()), + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); + auto s = iter->status(); + + if (s.ok() && paranoid_file_checks_) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} + s = iter->status(); + } + + delete iter; + + if (!s.ok()) { + output_status = s; + break; + } + } + }; + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back(verify_table, + std::ref(compact_->sub_compact_states[i].status)); + } + verify_table(compact_->sub_compact_states[0].status); + for (auto& thread : thread_pool) { + thread.join(); + } + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; + } + } + } + + TablePropertiesCollection tp; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.outputs) { + auto fn = + TableFileName(state.compaction->immutable_cf_options()->cf_paths, + output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); + tp[fn] = output.table_properties; + } + } + compact_->compaction->SetOutputTableProperties(std::move(tp)); + + // Finish up all book-keeping to unify the subcompaction results + AggregateStatistics(); + UpdateCompactionStats(); + RecordCompactionIOStats(); + LogFlush(db_options_.info_log); + TEST_SYNC_POINT("CompactionJob::Run():End"); + + compact_->status = status; + return status; +} +
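[Editor's sketch] The verify_table lambda above distributes output files across threads with nothing but a shared atomic index: each worker claims the next unclaimed file via fetch_add until the list runs out, so no file is verified twice and no mutex is needed. A self-contained illustration of the same pattern (illustrative names, not RocksDB code):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  std::vector<int> file_numbers = {101, 102, 103, 104, 105};
  std::atomic<size_t> next_idx(0);

  auto worker = [&](int id) {
    while (true) {
      size_t i = next_idx.fetch_add(1);  // claim the next unverified file
      if (i >= file_numbers.size()) break;
      std::printf("worker %d verifies file %d\n", id, file_numbers[i]);
    }
  };

  std::thread extra(worker, 1);  // one extra thread...
  worker(0);                     // ...while the current thread works too
  extra.join();
}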
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_INSTALL); + db_mutex_->AssertHeld(); + Status status = compact_->status; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + cfd->internal_stats()->AddCompactionStats( + compact_->compaction->output_level(), thread_pri_, compaction_stats_); + + if (status.ok()) { + status = InstallCompactionResults(mutable_cf_options); + } + VersionStorageInfo::LevelSummaryStorage tmp; + auto vstorage = cfd->current()->storage_info(); + const auto& stats = compaction_stats_; + + double read_write_amp = 0.0; + double write_amp = 0.0; + double bytes_read_per_sec = 0; + double bytes_written_per_sec = 0; + + if (stats.bytes_read_non_output_levels > 0) { + read_write_amp = (stats.bytes_written + stats.bytes_read_output_level + + stats.bytes_read_non_output_levels) / + static_cast<double>(stats.bytes_read_non_output_levels); + write_amp = stats.bytes_written / + static_cast<double>(stats.bytes_read_non_output_levels); + } + if (stats.micros > 0) { + bytes_read_per_sec = + (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / + static_cast<double>(stats.micros); + bytes_written_per_sec = + stats.bytes_written / static_cast<double>(stats.micros); + } + + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d) " + "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " + "write-amplify(%.1f) %s, records in: %" PRIu64 + ", records dropped: %" PRIu64 " output_compression: %s\n", + cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, + bytes_written_per_sec, compact_->compaction->output_level(), + stats.num_input_files_in_non_output_levels, + stats.num_input_files_in_output_level, stats.num_output_files, + stats.bytes_read_non_output_levels / 1048576.0, + stats.bytes_read_output_level / 1048576.0, + stats.bytes_written / 1048576.0, read_write_amp, write_amp, + status.ToString().c_str(), stats.num_input_records, + stats.num_dropped_records, + CompressionTypeToString(compact_->compaction->output_compression()) + .c_str()); + + UpdateCompactionJobStats(stats); + + auto stream = event_logger_->LogToBuffer(log_buffer_); + stream << "job" << job_id_ << "event" + << "compaction_finished" + << "compaction_time_micros" << stats.micros + << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" + << compact_->compaction->output_level() << "num_output_files" + << compact_->NumOutputFiles() << "total_output_size" + << compact_->total_bytes << "num_input_records" + << stats.num_input_records << "num_output_records" + << compact_->num_output_records << "num_subcompactions" + << compact_->sub_compact_states.size() << "output_compression" + << CompressionTypeToString(compact_->compaction->output_compression()); + + if (compaction_job_stats_ != nullptr) { + stream << "num_single_delete_mismatches" + << compaction_job_stats_->num_single_del_mismatch; + stream << "num_single_delete_fallthrough" + << compaction_job_stats_->num_single_del_fallthru; + } + + if (measure_io_stats_ && compaction_job_stats_ != nullptr) { + stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos; + stream << "file_range_sync_nanos" + << compaction_job_stats_->file_range_sync_nanos; + stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos; + stream << "file_prepare_write_nanos" + << compaction_job_stats_->file_prepare_write_nanos; + } + + stream << "lsm_state"; + stream.StartArray(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); + } + stream.EndArray(); + + CleanupCompaction(); + return status; +} +
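[Editor's sketch] The two amplification figures logged above, computed once on example byte counts (the numbers are made up):

#include <cstdio>

int main() {
  double bytes_written = 90e6;                 // compaction output
  double bytes_read_output_level = 40e6;       // overlapping data re-read
  double bytes_read_non_output_levels = 60e6;  // the "new" input data

  // read-write-amplify: total bytes moved per byte of new input
  double read_write_amp =
      (bytes_written + bytes_read_output_level +
       bytes_read_non_output_levels) /
      bytes_read_non_output_levels;  // (90 + 40 + 60) / 60 ≈ 3.2
  // write-amplify: bytes written per byte of new input
  double write_amp =
      bytes_written / bytes_read_non_output_levels;  // 90 / 60 = 1.5

  std::printf("read-write-amplify(%.1f) write-amplify(%.1f)\n",
              read_write_amp, write_amp);
}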
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { + assert(sub_compact != nullptr); + + uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000; + + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + + // Create compaction filter and fail the compaction if + // IgnoreSnapshots() = false because it is not supported anymore + const CompactionFilter* compaction_filter = + cfd->ioptions()->compaction_filter; + std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr; + if (compaction_filter == nullptr) { + compaction_filter_from_factory = + sub_compact->compaction->CreateCompactionFilter(); + compaction_filter = compaction_filter_from_factory.get(); + } + if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) { + sub_compact->status = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return; + } + + CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), + existing_snapshots_); + + // Although the v2 aggregator is what the level iterator(s) know about, + // the AddTombstones calls will be propagated down to the v1 aggregator. + std::unique_ptr<InternalIterator> input(versions_->MakeInputIterator( + sub_compact->compaction, &range_del_agg, file_options_for_read_)); + + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PROCESS_KV); + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + const uint64_t kRecordStatsEvery = 1000; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + + MergeHelper merge( + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, + compaction_filter, db_options_.info_log.get(), + false /* internal key corruption is expected */, + existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), + snapshot_checker_, compact_->compaction->level(), + db_options_.statistics.get()); + + TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:1", + reinterpret_cast<void*>( + const_cast<std::atomic<bool>*>(manual_compaction_paused_))); + + Slice* start = sub_compact->start; + Slice* end = sub_compact->end; + if (start != nullptr) { + IterKey start_iter; + start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); + input->Seek(start_iter.GetInternalKey()); + } else { + input->SeekToFirst(); + } + + Status status; + sub_compact->c_iter.reset(new CompactionIterator( + input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), + &existing_snapshots_, earliest_write_conflict_snapshot_, + snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, + &range_del_agg, sub_compact->compaction, compaction_filter, + shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, + db_options_.info_log)); + auto c_iter = sub_compact->c_iter.get(); + c_iter->SeekToFirst(); + if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { + // ShouldStopBefore() maintains state based on keys processed so far. The + // compaction loop always calls it on the "next" key, thus won't tell it the + // first key. So we do that here. + sub_compact->ShouldStopBefore(c_iter->key(), + sub_compact->current_output_file_size); + } + const auto& c_iter_stats = c_iter->iter_stats(); + + while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { + // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() + // returns true.
+ const Slice& key = c_iter->key(); + const Slice& value = c_iter->value(); + + // If an end key (exclusive) is specified, check if the current key is + // greater than or equal to it, and exit if it is, because the iterator is + // out of its range + if (end != nullptr && + cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) { + break; + } + if (c_iter_stats.num_input_records % kRecordStatsEvery == + kRecordStatsEvery - 1) { + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); + c_iter->ResetRecordCounts(); + RecordCompactionIOStats(); + } + + // Open output file if necessary + if (sub_compact->builder == nullptr) { + status = OpenCompactionOutputFile(sub_compact); + if (!status.ok()) { + break; + } + } + assert(sub_compact->builder != nullptr); + assert(sub_compact->current_output() != nullptr); + sub_compact->builder->Add(key, value); + sub_compact->current_output_file_size = sub_compact->builder->FileSize(); + const ParsedInternalKey& ikey = c_iter->ikey(); + sub_compact->current_output()->meta.UpdateBoundaries( + key, value, ikey.sequence, ikey.type); + sub_compact->num_output_records++; + + // Close output file if it is big enough. Two possibilities determine that + // it's time to close it: (1) the current key should be this file's last key, + // (2) the next key should not be in this file. + // + // TODO(aekmekji): determine if file should be closed earlier than this + // during subcompactions (i.e. if output size, estimated by input size, is + // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB + // and 0.6MB instead of 1MB and 0.2MB) + bool output_file_ended = false; + Status input_status; + if (sub_compact->compaction->output_level() != 0 && + sub_compact->current_output_file_size >= + sub_compact->compaction->max_output_file_size()) { + // (1) this key terminates the file. For historical reasons, the iterator + // status before advancing will be given to FinishCompactionOutputFile(). + input_status = input->status(); + output_file_ended = true; + } + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:2", + reinterpret_cast<void*>( + const_cast<std::atomic<bool>*>(manual_compaction_paused_))); + c_iter->Next(); + if (c_iter->status().IsManualCompactionPaused()) { + break; + } + if (!output_file_ended && c_iter->Valid() && + sub_compact->compaction->output_level() != 0 && + sub_compact->ShouldStopBefore(c_iter->key(), + sub_compact->current_output_file_size) && + sub_compact->builder != nullptr) { + // (2) this key belongs to the next file. For historical reasons, the + // iterator status after advancing will be given to + // FinishCompactionOutputFile().
+ input_status = input->status(); + output_file_ended = true; + } + if (output_file_ended) { + const Slice* next_key = nullptr; + if (c_iter->Valid()) { + next_key = &c_iter->key(); + } + CompactionIterationStats range_del_out_stats; + status = + FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg, + &range_del_out_stats, next_key); + RecordDroppedKeys(range_del_out_stats, + &sub_compact->compaction_job_stats); + } + } + + sub_compact->compaction_job_stats.num_input_deletion_records = + c_iter_stats.num_input_deletion_records; + sub_compact->compaction_job_stats.num_corrupt_keys = + c_iter_stats.num_input_corrupt_records; + sub_compact->compaction_job_stats.num_single_del_fallthru = + c_iter_stats.num_single_del_fallthru; + sub_compact->compaction_job_stats.num_single_del_mismatch = + c_iter_stats.num_single_del_mismatch; + sub_compact->compaction_job_stats.total_input_raw_key_bytes += + c_iter_stats.total_input_raw_key_bytes; + sub_compact->compaction_job_stats.total_input_raw_value_bytes += + c_iter_stats.total_input_raw_value_bytes; + + RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, + c_iter_stats.total_filter_time); + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); + RecordCompactionIOStats(); + + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed))) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + if (status.ok()) { + status = input->status(); + } + if (status.ok()) { + status = c_iter->status(); + } + + if (status.ok() && sub_compact->builder == nullptr && + sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) { + // handle subcompaction containing only range deletions + status = OpenCompactionOutputFile(sub_compact); + } + + // Call FinishCompactionOutputFile() even if status is not ok: it needs to + // close the output file. 
+ if (sub_compact->builder != nullptr) { + CompactionIterationStats range_del_out_stats; + Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg, + &range_del_out_stats); + if (status.ok()) { + status = s; + } + RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); + } + + sub_compact->compaction_job_stats.cpu_micros = + env_->NowCPUNanos() / 1000 - prev_cpu_micros; + + if (measure_io_stats_) { + sub_compact->compaction_job_stats.file_write_nanos += + IOSTATS(write_nanos) - prev_write_nanos; + sub_compact->compaction_job_stats.file_fsync_nanos += + IOSTATS(fsync_nanos) - prev_fsync_nanos; + sub_compact->compaction_job_stats.file_range_sync_nanos += + IOSTATS(range_sync_nanos) - prev_range_sync_nanos; + sub_compact->compaction_job_stats.file_prepare_write_nanos += + IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos; + sub_compact->compaction_job_stats.cpu_micros -= + (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos + + IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) / + 1000; + if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) { + SetPerfLevel(prev_perf_level); + } + } + + sub_compact->c_iter.reset(); + input.reset(); + sub_compact->status = status; +} + +void CompactionJob::RecordDroppedKeys( + const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats) { + if (c_iter_stats.num_record_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, + c_iter_stats.num_record_drop_user); + } + if (c_iter_stats.num_record_drop_hidden > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, + c_iter_stats.num_record_drop_hidden); + if (compaction_job_stats) { + compaction_job_stats->num_records_replaced += + c_iter_stats.num_record_drop_hidden; + } + } + if (c_iter_stats.num_record_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, + c_iter_stats.num_record_drop_obsolete); + if (compaction_job_stats) { + compaction_job_stats->num_expired_deletion_records += + c_iter_stats.num_record_drop_obsolete; + } + } + if (c_iter_stats.num_record_drop_range_del > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL, + c_iter_stats.num_record_drop_range_del); + } + if (c_iter_stats.num_range_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE, + c_iter_stats.num_range_del_drop_obsolete); + } + if (c_iter_stats.num_optimized_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + c_iter_stats.num_optimized_del_drop_obsolete); + } +} + +Status CompactionJob::FinishCompactionOutputFile( + const Status& input_status, SubcompactionState* sub_compact, + CompactionRangeDelAggregator* range_del_agg, + CompactionIterationStats* range_del_out_stats, + const Slice* next_table_min_key /* = nullptr */) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_SYNC_FILE); + assert(sub_compact != nullptr); + assert(sub_compact->outfile); + assert(sub_compact->builder != nullptr); + assert(sub_compact->current_output() != nullptr); + + uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber(); + assert(output_number != 0); + + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + const Comparator* ucmp = cfd->user_comparator(); + + // Check for iterator errors + Status s = input_status; + auto meta = &sub_compact->current_output()->meta; + assert(meta != nullptr); + if (s.ok()) { + Slice lower_bound_guard, upper_bound_guard; + std::string smallest_user_key; + const Slice 
*lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; + if (sub_compact->outputs.size() == 1) { + // For the first output table, include range tombstones before the min key + // but after the subcompaction boundary. + lower_bound = sub_compact->start; + lower_bound_from_sub_compact = true; + } else if (meta->smallest.size() > 0) { + // For subsequent output tables, only include range tombstones from min + // key onwards since the previous file was extended to contain range + // tombstones falling before min key. + smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/); + lower_bound_guard = Slice(smallest_user_key); + lower_bound = &lower_bound_guard; + } else { + lower_bound = nullptr; + } + if (next_table_min_key != nullptr) { + // This may be the last file in the subcompaction in some cases, so we + // need to compare the end key of subcompaction with the next file start + // key. When the end key is chosen by the subcompaction, we know that + // it must be the biggest key in output file. Therefore, it is safe to + // use the smaller key as the upper bound of the output file, to ensure + // that there is no overlapping between different output files. + upper_bound_guard = ExtractUserKey(*next_table_min_key); + if (sub_compact->end != nullptr && + ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) { + upper_bound = sub_compact->end; + } else { + upper_bound = &upper_bound_guard; + } + } else { + // This is the last file in the subcompaction, so extend until the + // subcompaction ends. + upper_bound = sub_compact->end; + } + auto earliest_snapshot = kMaxSequenceNumber; + if (existing_snapshots_.size() > 0) { + earliest_snapshot = existing_snapshots_[0]; + } + bool has_overlapping_endpoints; + if (upper_bound != nullptr && meta->largest.size() > 0) { + has_overlapping_endpoints = + ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0; + } else { + has_overlapping_endpoints = false; + } + + // The end key of the subcompaction must be bigger or equal to the upper + // bound. If the end of subcompaction is null or the upper bound is null, + // it means that this file is the last file in the compaction. So there + // will be no overlapping between this file and others. + assert(sub_compact->end == nullptr || + upper_bound == nullptr || + ucmp->Compare(*upper_bound , *sub_compact->end) <= 0); + auto it = range_del_agg->NewIterator(lower_bound, upper_bound, + has_overlapping_endpoints); + // Position the range tombstone output iterator. There may be tombstone + // fragments that are entirely out of range, so make sure that we do not + // include those. + if (lower_bound != nullptr) { + it->Seek(*lower_bound); + } else { + it->SeekToFirst(); + } + for (; it->Valid(); it->Next()) { + auto tombstone = it->Tombstone(); + if (upper_bound != nullptr) { + int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_); + if ((has_overlapping_endpoints && cmp < 0) || + (!has_overlapping_endpoints && cmp <= 0)) { + // Tombstones starting after upper_bound only need to be included in + // the next table. If the current SST ends before upper_bound, i.e., + // `has_overlapping_endpoints == false`, we can also skip over range + // tombstones that start exactly at upper_bound. Such range tombstones + // will be included in the next file and are not relevant to the point + // keys or endpoints of the current file. 
+ break; + } + } + + if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) { + // TODO(andrewkr): tombstones that span multiple output files are + // counted for each compaction output file, so lots of double counting. + range_del_out_stats->num_range_del_drop_obsolete++; + range_del_out_stats->num_record_drop_obsolete++; + continue; + } + + auto kv = tombstone.Serialize(); + assert(lower_bound == nullptr || + ucmp->Compare(*lower_bound, kv.second) < 0); + sub_compact->builder->Add(kv.first.Encode(), kv.second); + InternalKey smallest_candidate = std::move(kv.first); + if (lower_bound != nullptr && + ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) { + // Pretend the smallest key has the same user key as lower_bound + // (the max key in the previous table or subcompaction) in order for + // files to appear key-space partitioned. + // + // When lower_bound is chosen by a subcompaction, we know that + // subcompactions over smaller keys cannot contain any keys at + // lower_bound. We also know that smaller subcompactions exist, because + // otherwise the subcompaction would be unbounded on the left. As a + // result, we know that no other files on the output level will contain + // actual keys at lower_bound (an output file may have a largest key of + // lower_bound@kMaxSequenceNumber, but this only indicates a large range + // tombstone was truncated). Therefore, it is safe to use the + // tombstone's sequence number, to ensure that keys at lower_bound at + // lower levels are covered by truncated tombstones. + // + // If lower_bound was chosen by the smallest data key in the file, + // choose lowest seqnum so this file's smallest internal key comes after + // the previous file's largest. The fake seqnum is OK because the read + // path's file-picking code only considers user key. + smallest_candidate = InternalKey( + *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0, + kTypeRangeDeletion); + } + InternalKey largest_candidate = tombstone.SerializeEndKey(); + if (upper_bound != nullptr && + ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) { + // Pretend the largest key has the same user key as upper_bound (the + // min key in the following table or subcompaction) in order for files + // to appear key-space partitioned. + // + // Choose highest seqnum so this file's largest internal key comes + // before the next file's/subcompaction's smallest. The fake seqnum is + // OK because the read path's file-picking code only considers the user + // key portion. + // + // Note Seek() also creates InternalKey with (user_key, + // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of + // kTypeRangeDeletion (0xF), so the range tombstone comes before the + // Seek() key in InternalKey's ordering. So Seek() will look in the + // next file for the user key. + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); + } +#ifndef NDEBUG + SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; + if (meta->smallest.size() > 0) { + smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode()); + } +#endif + meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, + tombstone.seq_, + cfd->internal_comparator()); + + // The smallest key in a file is used for range tombstone truncation, so + // it cannot have a seqnum of 0 (unless the smallest data key in a file + // has a seqnum of 0). Otherwise, the truncated tombstone may expose + // deleted keys at lower levels.
+ assert(smallest_ikey_seqnum == 0 || + ExtractInternalKeyFooter(meta->smallest.Encode()) != + PackSequenceAndType(0, kTypeRangeDeletion)); + } + meta->marked_for_compaction = sub_compact->builder->NeedCompact(); + } + const uint64_t current_entries = sub_compact->builder->NumEntries(); + if (s.ok()) { + s = sub_compact->builder->Finish(); + } else { + sub_compact->builder->Abandon(); + } + const uint64_t current_bytes = sub_compact->builder->FileSize(); + if (s.ok()) { + // Add the checksum information to file metadata. + meta->file_checksum = sub_compact->builder->GetFileChecksum(); + meta->file_checksum_func_name = + sub_compact->builder->GetFileChecksumFuncName(); + + meta->fd.file_size = current_bytes; + } + sub_compact->current_output()->finished = true; + sub_compact->total_bytes += current_bytes; + + // Finish and check for file errors + if (s.ok()) { + StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + s = sub_compact->outfile->Sync(db_options_.use_fsync); + } + if (s.ok()) { + s = sub_compact->outfile->Close(); + } + sub_compact->outfile.reset(); + + TableProperties tp; + if (s.ok()) { + tp = sub_compact->builder->GetTableProperties(); + } + + if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) { + // If there is nothing to output, there is no need to generate an sst + // file. This happens when the output level is the bottom level and the + // subcompaction produced nothing. + std::string fname = + TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + meta->fd.GetNumber(), meta->fd.GetPathId()); + env_->DeleteFile(fname); + + // Also need to remove the file from outputs, or it will be added to the + // VersionEdit. + assert(!sub_compact->outputs.empty()); + sub_compact->outputs.pop_back(); + meta = nullptr; + } + + if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { + // Output to event logger and fire events. + sub_compact->current_output()->table_properties = + std::make_shared<TableProperties>(tp); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64 + " keys, %" PRIu64 " bytes%s", + cfd->GetName().c_str(), job_id_, output_number, + current_entries, current_bytes, + meta->marked_for_compaction ? " (need compaction)" : ""); + } + std::string fname; + FileDescriptor output_fd; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + if (meta != nullptr) { + fname = + TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + meta->fd.GetNumber(), meta->fd.GetPathId()); + output_fd = meta->fd; + oldest_blob_file_number = meta->oldest_blob_file_number; + } else { + fname = "(nil)"; + } + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, + job_id_, output_fd, oldest_blob_file_number, tp, + TableFileCreationReason::kCompaction, s); + +#ifndef ROCKSDB_LITE + // Report new file to SstFileManagerImpl + auto sfm = + static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get()); + if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) { + sfm->OnAddFile(fname); + if (sfm->IsMaxAllowedSpaceReached()) { + // TODO(ajkr): should we return OK() if max space was reached by the final + // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "CompactionJob::FinishCompactionOutputFile:" + "MaxAllowedSpaceReached"); + InstrumentedMutexLock l(db_mutex_); + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); + } + } +#endif + + sub_compact->builder.reset(); + sub_compact->current_output_file_size = 0; + return s; +} + +Status CompactionJob::InstallCompactionResults( + const MutableCFOptions& mutable_cf_options) { + db_mutex_->AssertHeld(); + + auto* compaction = compact_->compaction; + // paranoia: verify that the files that we started with + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact_. + if (!versions_->VerifyCompactionFileConsistency(compaction)) { + Compaction::InputLevelSummaryBuffer inputs_summary; + + ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted", + compaction->column_family_data()->GetName().c_str(), + job_id_, compaction->InputLevelSummary(&inputs_summary)); + return Status::Corruption("Compaction input files inconsistent"); + } + + { + Compaction::InputLevelSummaryBuffer inputs_summary; + ROCKS_LOG_INFO( + db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes); + } + + // Add compaction inputs + compaction->AddInputDeletions(compact_->compaction->edit()); + + for (const auto& sub_compact : compact_->sub_compact_states) { + for (const auto& out : sub_compact.outputs) { + compaction->edit()->AddFile(compaction->output_level(), out.meta); + } + } + return versions_->LogAndApply(compaction->column_family_data(), + mutable_cf_options, compaction->edit(), + db_mutex_, db_directory_); +} + +void CompactionJob::RecordCompactionIOStats() { + RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read)); + IOSTATS_RESET(bytes_read); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} + +Status CompactionJob::OpenCompactionOutputFile( + SubcompactionState* sub_compact) { + assert(sub_compact != nullptr); + assert(sub_compact->builder == nullptr); + // no need to lock because VersionSet::next_file_number_ is atomic + uint64_t file_number = versions_->NewFileNumber(); + std::string fname = + TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + file_number, sub_compact->compaction->output_path_id()); + // Fire events. 
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); +#ifndef ROCKSDB_LITE + EventHelpers::NotifyTableFileCreationStarted( + cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, + TableFileCreationReason::kCompaction); +#endif // !ROCKSDB_LITE + // Make the output file + std::unique_ptr<FSWritableFile> writable_file; +#ifndef NDEBUG + bool syncpoint_arg = file_options_.use_direct_writes; + TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", + &syncpoint_arg); +#endif + Status s = NewWritableFile(fs_, fname, &writable_file, file_options_); + if (!s.ok()) { + ROCKS_LOG_ERROR( + db_options_.info_log, + "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64 + " fails at NewWritableFile with status %s", + sub_compact->compaction->column_family_data()->GetName().c_str(), + job_id_, file_number, s.ToString().c_str()); + LogFlush(db_options_.info_log); + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), + fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, + TableProperties(), TableFileCreationReason::kCompaction, s); + return s; + } + + // Try to figure out the output file's oldest ancester time. + int64_t temp_current_time = 0; + auto get_time_status = env_->GetCurrentTime(&temp_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!get_time_status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time. Status: %s", + get_time_status.ToString().c_str()); + } + uint64_t current_time = static_cast<uint64_t>(temp_current_time); + uint64_t oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime(); + if (oldest_ancester_time == port::kMaxUint64) { + oldest_ancester_time = current_time; + } + + // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + { + SubcompactionState::Output out; + out.meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + out.meta.oldest_ancester_time = oldest_ancester_time; + out.meta.file_creation_time = current_time; + out.finished = false; + sub_compact->outputs.push_back(out); + } + + writable_file->SetIOPriority(Env::IOPriority::IO_LOW); + writable_file->SetWriteLifeTimeHint(write_hint_); + writable_file->SetPreallocationBlockSize(static_cast<size_t>( + sub_compact->compaction->OutputFilePreallocationSize())); + const auto& listeners = + sub_compact->compaction->immutable_cf_options()->listeners; + sub_compact->outfile.reset( + new WritableFileWriter(std::move(writable_file), fname, file_options_, + env_, db_options_.statistics.get(), listeners, + db_options_.sst_file_checksum_func.get())); + + // If the Column family flag is to only optimize filters for hits, + // we can skip creating filters if this is the bottommost_level where + // data is going to be found + bool skip_filters = + cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; + + sub_compact->builder.reset(NewTableBuilder( + *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(), + sub_compact->compaction->output_compression(), + 0 /*sample_for_compression */, + sub_compact->compaction->output_compression_opts(), + sub_compact->compaction->output_level(), skip_filters, + oldest_ancester_time, 0 /* oldest_key_time */, + sub_compact->compaction->max_output_file_size(), current_time)); + LogFlush(db_options_.info_log); + return s; +} +
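[Editor's sketch] The oldest-ancestor-time fallback above prefers the oldest time carried by the input files and falls back to "now" when that is unknown (encoded as the max-uint64 sentinel). A standalone illustration of just that fallback (illustrative, not RocksDB code):

#include <cstdint>
#include <cstdio>
#include <ctime>
#include <limits>

int main() {
  uint64_t current_time = static_cast<uint64_t>(std::time(nullptr));
  // Pretend no input file carried a creation time:
  uint64_t oldest_ancester_time = std::numeric_limits<uint64_t>::max();
  if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
    oldest_ancester_time = current_time;  // fall back to the current time
  }
  std::printf("oldest_ancester_time = %llu\n",
              static_cast<unsigned long long>(oldest_ancester_time));
}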
+void CompactionJob::CleanupCompaction() { + for (SubcompactionState& sub_compact : compact_->sub_compact_states) { + const auto& sub_status = sub_compact.status; + + if (sub_compact.builder != nullptr) { + // May happen if we get a shutdown call in the middle of compaction + sub_compact.builder->Abandon(); + sub_compact.builder.reset(); + } else { + assert(!sub_status.ok() || sub_compact.outfile == nullptr); + } + for (const auto& out : sub_compact.outputs) { + // If this file was inserted into the table cache then remove it + // here because this compaction was not committed. + if (!sub_status.ok()) { + TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber()); + } + } + } + delete compact_; + compact_ = nullptr; +} + +#ifndef ROCKSDB_LITE +namespace { +void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { + assert(prefix_length > 0); + size_t length = src.size() > prefix_length ? prefix_length : src.size(); + dst->assign(src.data(), length); +} +} // namespace + +#endif // !ROCKSDB_LITE +
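[Editor's sketch] CopyPrefix copies at most prefix_length bytes, or the whole key when it is shorter; it is used below to record the smallest/largest output key prefixes. The same helper exercised standalone (rocksdb::Slice swapped for std::string so the sketch compiles on its own):

#include <cassert>
#include <cstdio>
#include <string>

static void CopyPrefix(const std::string& src, size_t prefix_length,
                       std::string* dst) {
  assert(prefix_length > 0);
  size_t length = src.size() > prefix_length ? prefix_length : src.size();
  dst->assign(src.data(), length);
}

int main() {
  std::string out;
  CopyPrefix("user_key_0001", 8, &out);  // longer than the prefix
  std::printf("%s\n", out.c_str());      // prints "user_key"
  CopyPrefix("abc", 8, &out);            // shorter than the prefix
  std::printf("%s\n", out.c_str());      // prints "abc"
}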
+void CompactionJob::UpdateCompactionStats() { + Compaction* compaction = compact_->compaction; + compaction_stats_.num_input_files_in_non_output_levels = 0; + compaction_stats_.num_input_files_in_output_level = 0; + for (int input_level = 0; + input_level < static_cast<int>(compaction->num_input_levels()); + ++input_level) { + if (compaction->level(input_level) != compaction->output_level()) { + UpdateCompactionInputStatsHelper( + &compaction_stats_.num_input_files_in_non_output_levels, + &compaction_stats_.bytes_read_non_output_levels, input_level); + } else { + UpdateCompactionInputStatsHelper( + &compaction_stats_.num_input_files_in_output_level, + &compaction_stats_.bytes_read_output_level, input_level); + } + } + + uint64_t num_output_records = 0; + + for (const auto& sub_compact : compact_->sub_compact_states) { + size_t num_output_files = sub_compact.outputs.size(); + if (sub_compact.builder != nullptr) { + // An error occurred so ignore the last output. + assert(num_output_files > 0); + --num_output_files; + } + compaction_stats_.num_output_files += static_cast<int>(num_output_files); + + num_output_records += sub_compact.num_output_records; + + for (const auto& out : sub_compact.outputs) { + compaction_stats_.bytes_written += out.meta.fd.file_size; + } + } + + if (compaction_stats_.num_input_records > num_output_records) { + compaction_stats_.num_dropped_records = + compaction_stats_.num_input_records - num_output_records; + } +} + +void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, + uint64_t* bytes_read, + int input_level) { + const Compaction* compaction = compact_->compaction; + auto num_input_files = compaction->num_input_files(input_level); + *num_files += static_cast<int>(num_input_files); + + for (size_t i = 0; i < num_input_files; ++i) { + const auto* file_meta = compaction->input(input_level, i); + *bytes_read += file_meta->fd.GetFileSize(); + compaction_stats_.num_input_records += + static_cast<uint64_t>(file_meta->num_entries); + } +} + +void CompactionJob::UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const { +#ifndef ROCKSDB_LITE + if (compaction_job_stats_) { + compaction_job_stats_->elapsed_micros = stats.micros; + + // input information + compaction_job_stats_->total_input_bytes = + stats.bytes_read_non_output_levels + stats.bytes_read_output_level; + compaction_job_stats_->num_input_records = stats.num_input_records; + compaction_job_stats_->num_input_files = + stats.num_input_files_in_non_output_levels + + stats.num_input_files_in_output_level; + compaction_job_stats_->num_input_files_at_output_level = + stats.num_input_files_in_output_level; + + // output information + compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->num_output_records = compact_->num_output_records; + compaction_job_stats_->num_output_files = stats.num_output_files; + + if (compact_->NumOutputFiles() > 0U) { + CopyPrefix(compact_->SmallestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->smallest_output_key_prefix); + CopyPrefix(compact_->LargestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->largest_output_key_prefix); + } + } +#else + (void)stats; +#endif // !ROCKSDB_LITE +} + +void CompactionJob::LogCompaction() { + Compaction* compaction = compact_->compaction; + ColumnFamilyData* cfd = compaction->column_family_data(); + + // Let's check if anything will get logged.
Don't prepare all the info if + // we're not logging + if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) { + Compaction::InputLevelSummaryBuffer inputs_summary; + ROCKS_LOG_INFO( + db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f", + cfd->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), compaction->score()); + char scratch[2345]; + compaction->Summary(scratch, sizeof(scratch)); + ROCKS_LOG_INFO(db_options_.info_log, "[%s] Compaction start summary: %s\n", + cfd->GetName().c_str(), scratch); + // build event logger report + auto stream = event_logger_->Log(); + stream << "job" << job_id_ << "event" + << "compaction_started" + << "compaction_reason" + << GetCompactionReasonString(compaction->compaction_reason()); + for (size_t i = 0; i < compaction->num_input_levels(); ++i) { + stream << ("files_L" + ToString(compaction->level(i))); + stream.StartArray(); + for (auto f : *compaction->inputs(i)) { + stream << f->fd.GetNumber(); + } + stream.EndArray(); + } + stream << "score" << compaction->score() << "input_data_size" + << compaction->CalculateTotalInputSize(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h new file mode 100644 index 000000000..c15f502a1 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job.h @@ -0,0 +1,198 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/column_family.h" +#include "db/compaction/compaction_iterator.h" +#include "db/dbformat.h" +#include "db/flush_scheduler.h" +#include "db/internal_stats.h" +#include "db/job_context.h" +#include "db/log_writer.h" +#include "db/memtable_list.h" +#include "db/range_del_aggregator.h" +#include "db/version_edit.h" +#include "db/write_controller.h" +#include "db/write_thread.h" +#include "logging/event_logger.h" +#include "options/cf_options.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/compaction_job_stats.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "table/scoped_arena_iterator.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class ErrorHandler; +class MemTable; +class SnapshotChecker; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; + +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. 
+class CompactionJob { + public: + CompactionJob(int job_id, Compaction* compaction, + const ImmutableDBOptions& db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic<bool>* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector<SequenceNumber> existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr<Cache> table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, + const std::atomic<bool>* manual_compaction_paused = nullptr); + + ~CompactionJob(); + + // no copy/move + CompactionJob(CompactionJob&& job) = delete; + CompactionJob(const CompactionJob& job) = delete; + CompactionJob& operator=(const CompactionJob& job) = delete; + + // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction + void Prepare(); + // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results + Status Run(); + + // REQUIRED: mutex held + // Add compaction input/output to the current version + Status Install(const MutableCFOptions& mutable_cf_options); + + private: + struct SubcompactionState; + + void AggregateStatistics(); + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. + void GenSubcompactionBoundaries(); + + // update the thread status for starting a compaction. + void ReportStartedCompaction(Compaction* compaction); + void AllocateCompactionOutputFileNumbers(); + // Call compaction filter.
Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + + Status FinishCompactionOutputFile( + const Status& input_status, SubcompactionState* sub_compact, + CompactionRangeDelAggregator* range_del_agg, + CompactionIterationStats* range_del_out_stats, + const Slice* next_table_min_key = nullptr); + Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); + void RecordCompactionIOStats(); + Status OpenCompactionOutputFile(SubcompactionState* sub_compact); + void CleanupCompaction(); + void UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const; + void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, + CompactionJobStats* compaction_job_stats = nullptr); + + void UpdateCompactionStats(); + void UpdateCompactionInputStatsHelper( + int* num_files, uint64_t* bytes_read, int input_level); + + void LogCompaction(); + + int job_id_; + + // CompactionJob state + struct CompactionState; + CompactionState* compact_; + CompactionJobStats* compaction_job_stats_; + InternalStats::CompactionStats compaction_stats_; + + // DBImpl state + const std::string& dbname_; + const ImmutableDBOptions& db_options_; + const FileOptions file_options_; + + Env* env_; + FileSystem* fs_; + // env_option optimized for compaction table reads + FileOptions file_options_for_read_; + VersionSet* versions_; + const std::atomic<bool>* shutting_down_; + const std::atomic<bool>* manual_compaction_paused_; + const SequenceNumber preserve_deletes_seqnum_; + LogBuffer* log_buffer_; + Directory* db_directory_; + Directory* output_directory_; + Statistics* stats_; + InstrumentedMutex* db_mutex_; + ErrorHandler* db_error_handler_; + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 that lie + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. + std::vector<SequenceNumber> existing_snapshots_; + + // This is the earliest snapshot that could be used for write-conflict + // checking by a transaction. For any user-key newer than this snapshot, we + // should make sure not to remove evidence that a write occurred. + SequenceNumber earliest_write_conflict_snapshot_; + + const SnapshotChecker* const snapshot_checker_; + + std::shared_ptr<Cache> table_cache_; + + EventLogger* event_logger_; + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + bool paranoid_file_checks_; + bool measure_io_stats_; + // Stores the Slices that designate the boundaries for each subcompaction + std::vector<Slice> boundaries_; + // Stores the approx size of keys covered in the range of each subcompaction + std::vector<uint64_t> sizes_; + Env::WriteLifeTimeHint write_hint_; + Env::Priority thread_pri_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc new file mode 100644 index 000000000..51a665797 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc @@ -0,0 +1,1043 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/job_context.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "memtable/hash_linklist_rep.h" +#include "monitoring/statistics.h" +#include "monitoring/thread_status_util.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/experimental.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/thread_status.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/mock_table.h" +#include "table/plain/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/compression.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +#if !defined(IOS_CROSS_COMPILE) +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { + +static std::string RandomString(Random* rnd, int len, double ratio) { + std::string r; + test::CompressibleString(rnd, ratio, len, &r); + return r; +} + +std::string Key(uint64_t key, int length) { + const int kBufSize = 1000; + char buf[kBufSize]; + if (length > kBufSize) { + length = kBufSize; + } + snprintf(buf, kBufSize, "%0*" PRIu64, length, key); + return std::string(buf); +} +
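[Editor's sketch] Key() above builds fixed-width, zero-padded decimal keys, so lexicographic key order matches numeric order. A standalone illustration of the same snprintf format (the "*" consumes the width argument):

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  char buf[32];
  uint64_t key = 42;
  int length = 10;
  snprintf(buf, sizeof(buf), "%0*" PRIu64, length, key);
  std::printf("%s\n", buf);  // prints "0000000042"
  // "0000000007" < "0000000042" lexicographically, matching 7 < 42.
}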
+class CompactionJobStatsTest : public testing::Test, + public testing::WithParamInterface<uint32_t> { + public: + std::string dbname_; + std::string alternative_wal_dir_; + Env* env_; + DB* db_; + std::vector<ColumnFamilyHandle*> handles_; + uint32_t max_subcompactions_; + + Options last_options_; + + CompactionJobStatsTest() : env_(Env::Default()) { + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + dbname_ = test::PerThreadDBPath("compaction_job_stats_test"); + alternative_wal_dir_ = dbname_ + "/wal"; + Options options; + options.create_if_missing = true; + max_subcompactions_ = GetParam(); + options.max_subcompactions = max_subcompactions_; + auto delete_options = options; + delete_options.wal_dir = alternative_wal_dir_; + EXPECT_OK(DestroyDB(dbname_, delete_options)); + // Destroy the DB again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options)); + db_ = nullptr; + Reopen(options); + } + + ~CompactionJobStatsTest() override { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + EXPECT_OK(DestroyDB(dbname_, options)); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + DBImpl* dbfull() { + return reinterpret_cast<DBImpl*>(db_); + } + + void CreateColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + ColumnFamilyOptions cf_opts(options); + size_t cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void CreateAndReopenWithCF(const std::vector<std::string>& cfs, + const Options& options) { + CreateColumnFamilies(cfs, options); + std::vector<std::string> cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + void ReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const std::vector<Options>& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + void ReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies( + const std::vector<std::string>& cfs, + const std::vector<Options>& options) { + Close(); + EXPECT_EQ(cfs.size(), options.size()); + std::vector<ColumnFamilyDescriptor> column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); + } + DBOptions db_opts = DBOptions(options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs, + const Options& options) { + Close(); + std::vector<Options> v_opts(cfs.size(), options); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + + void Reopen(const Options& options) { + ASSERT_OK(TryReopen(options)); + } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + delete db_; + db_ = nullptr; + } + + void DestroyAndReopen(const Options& options) { + // Destroy using last options + Destroy(last_options_); + ASSERT_OK(TryReopen(options)); + } + + void Destroy(const Options& options) { + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + } + + Status ReadOnlyReopen(const Options& options) { + return DB::OpenForReadOnly(options, dbname_, &db_); + } + + Status TryReopen(const Options& options) { + Close(); + last_options_ = options; + return DB::Open(options, dbname_, &db_); + } + + Status Flush(int cf = 0) { + if (cf == 0) { + return db_->Flush(FlushOptions()); + } else { + return db_->Flush(FlushOptions(), handles_[cf]); + } + } + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) { + return db_->Put(wo, k, v); + } + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + return db_->Put(wo, handles_[cf], k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + Status Delete(int cf, const std::string& k) { + return
db_->Delete(WriteOptions(), handles_[cf], k); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, handles_[cf], k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + int NumTableFilesAtLevel(int level, int cf = 0) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(level), &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), + &property)); + } + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel(int cf = 0) { + int num_levels = + (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); + std::string result; + size_t last_non_zero_offset = 0; + for (int level = 0; level < num_levels; level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { + Range r(start, limit); + uint64_t size; + if (cf == 0) { + db_->GetApproximateSizes(&r, 1, &size); + } else { + db_->GetApproximateSizes(handles_[1], &r, 1, &size); + } + return size; + } + + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + CompactRangeOptions compact_options; + compact_options.target_path_id = target_path_id; + ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit)); + } + + void Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); + } + + void Compact(const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit)); + } + + void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) { + ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf], + true /* disallow trivial move */)); + } + + // Do n memtable compactions, each of which produces an sstable + // covering the range [small,large]. 
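+  //
+  // Illustrative usage (hypothetical arguments): MakeTables(3, "p", "q")
+  // would flush three overlapping files, all of which stay in L0:
+  //
+  //   MakeTables(3, "p", "q");
+  //   ASSERT_EQ(FilesPerLevel(), "3");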
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(Flush(cf));
+    }
+  }
+
+  static void SetDeletionCompactionStats(
+      CompactionJobStats *stats, uint64_t input_deletions,
+      uint64_t expired_deletions, uint64_t records_replaced) {
+    stats->num_input_deletion_records = input_deletions;
+    stats->num_expired_deletion_records = expired_deletions;
+    stats->num_records_replaced = records_replaced;
+  }
+
+  void MakeTableWithKeyValues(
+      Random* rnd, uint64_t smallest, uint64_t largest,
+      int key_size, int value_size, uint64_t interval,
+      double ratio, int cf = 0) {
+    for (auto key = smallest; key < largest; key += interval) {
+      ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+                    Slice(RandomString(rnd, value_size, ratio))));
+    }
+    ASSERT_OK(Flush(cf));
+  }
+
+  // This function assumes that two rounds of keys have been inserted into
+  // the database, matching the behavior of the DeletionStatsTest.
+  void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+      uint64_t interval, int deletion_interval, int key_size,
+      uint64_t cutoff_key_num, CompactionJobStats* stats, int cf = 0) {
+
+    // interval needs to be >= 2 so that deletion entries can be inserted
+    // that are intended to not result in an actual key deletion by using
+    // an offset of 1 from another existing key
+    ASSERT_GE(interval, 2);
+
+    uint64_t ctr = 1;
+    uint32_t deletions_made = 0;
+    uint32_t num_deleted = 0;
+    uint32_t num_expired = 0;
+    for (auto key = smallest; key <= largest; key += interval, ctr++) {
+      if (ctr % deletion_interval == 0) {
+        ASSERT_OK(Delete(cf, Key(key, key_size)));
+        deletions_made++;
+        num_deleted++;
+
+        if (key > cutoff_key_num) {
+          num_expired++;
+        }
+      }
+    }
+
+    // Insert some deletions for keys that don't exist, both inside and
+    // outside the key range
+    ASSERT_OK(Delete(cf, Key(smallest+1, key_size)));
+    deletions_made++;
+
+    ASSERT_OK(Delete(cf, Key(smallest-1, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Delete(cf, Key(smallest-9, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Flush(cf));
+    SetDeletionCompactionStats(stats, deletions_made, num_expired,
+                               num_deleted);
+  }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+  CompactionJobStatsChecker()
+      : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+  size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+  void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction completes, this function verifies the returned
+  // CompactionJobInfo against the oldest CompactionJobStats added earlier
+  // in "expected_stats_" which has not yet been used for verification.
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    if (verify_next_comp_io_stats_) {
+      ASSERT_GT(ci.stats.file_write_nanos, 0);
+      ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+      ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+      ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+      verify_next_comp_io_stats_ = false;
+    }
+
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (expected_stats_.size()) {
+      Verify(ci.stats, expected_stats_.front());
+      expected_stats_.pop();
+    }
+  }
+
+  // A helper function which verifies whether two CompactionJobStats
+  // match.  The verification of most compaction stats is done with
+  // ASSERT_EQ except for the total input / output bytes, which we
+  // check with ASSERT_GE and ASSERT_LE under a reasonable bias ---
+  // 10% in the uncompressed case and 20% when compression is used.
+  virtual void Verify(const CompactionJobStats& current_stats,
+                      const CompactionJobStats& stats) {
+    // time
+    ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+    ASSERT_EQ(current_stats.num_input_records,
+              stats.num_input_records);
+    ASSERT_EQ(current_stats.num_input_files,
+              stats.num_input_files);
+    ASSERT_EQ(current_stats.num_input_files_at_output_level,
+              stats.num_input_files_at_output_level);
+
+    ASSERT_EQ(current_stats.num_output_records,
+              stats.num_output_records);
+    ASSERT_EQ(current_stats.num_output_files,
+              stats.num_output_files);
+
+    ASSERT_EQ(current_stats.is_manual_compaction,
+              stats.is_manual_compaction);
+
+    // file size
+    double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+    ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+              stats.total_input_bytes);
+    ASSERT_LE(current_stats.total_input_bytes,
+              stats.total_input_bytes * (1.00 + kFileSizeBias));
+    ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+              stats.total_output_bytes);
+    ASSERT_LE(current_stats.total_output_bytes,
+              stats.total_output_bytes * (1.00 + kFileSizeBias));
+    ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+              stats.total_input_raw_key_bytes);
+    ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+              stats.total_input_raw_value_bytes);
+
+    ASSERT_EQ(current_stats.num_records_replaced,
+              stats.num_records_replaced);
+
+    ASSERT_EQ(current_stats.num_corrupt_keys,
+              stats.num_corrupt_keys);
+
+    ASSERT_EQ(
+        std::string(current_stats.smallest_output_key_prefix),
+        std::string(stats.smallest_output_key_prefix));
+    ASSERT_EQ(
+        std::string(current_stats.largest_output_key_prefix),
+        std::string(stats.largest_output_key_prefix));
+  }
+
+  // Adds an expected compaction stats object, which will be used to
+  // verify the CompactionJobStats returned by the OnCompactionCompleted()
+  // callback.
+  void AddExpectedStats(const CompactionJobStats& stats) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    expected_stats_.push(stats);
+  }
+
+  void EnableCompression(bool flag) {
+    compression_enabled_ = flag;
+  }
+
+  bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+  std::mutex mutex_;
+  std::queue<CompactionJobStats> expected_stats_;
+  bool compression_enabled_;
+  bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+  // Verifies whether two CompactionJobStats match.
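+  // Unlike the base-class Verify(), which compares nearly every field,
+  // this override only checks the deletion-related counters (plus the
+  // corrupt-key count), since those are what DeletionStatsTest exercises.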
+  void Verify(const CompactionJobStats& current_stats,
+              const CompactionJobStats& stats) override {
+    ASSERT_EQ(
+        current_stats.num_input_deletion_records,
+        stats.num_input_deletion_records);
+    ASSERT_EQ(
+        current_stats.num_expired_deletion_records,
+        stats.num_expired_deletion_records);
+    ASSERT_EQ(
+        current_stats.num_records_replaced,
+        stats.num_records_replaced);
+
+    ASSERT_EQ(current_stats.num_corrupt_keys,
+              stats.num_corrupt_keys);
+  }
+};
+
+namespace {
+
+uint64_t EstimatedFileSize(
+    uint64_t num_records, size_t key_size, size_t value_size,
+    double compression_ratio = 1.0,
+    size_t block_size = 4096,
+    int bloom_bits_per_key = 10) {
+  const size_t kPerKeyOverhead = 8;
+  const size_t kFooterSize = 512;
+
+  uint64_t data_size =
+      static_cast<uint64_t>(
+          num_records * (key_size + value_size * compression_ratio +
+                         kPerKeyOverhead));
+
+  return data_size + kFooterSize
+         + num_records * bloom_bits_per_key / 8      // filter block
+         + data_size * (key_size + 8) / block_size;  // index block
+}
+
+namespace {
+
+void CopyPrefix(
+    const Slice& src, size_t prefix_length, std::string* dst) {
+  assert(prefix_length > 0);
+  size_t length = src.size() > prefix_length ? prefix_length : src.size();
+  dst->assign(src.data(), length);
+}
+
+}  // namespace
+
+CompactionJobStats NewManualCompactionJobStats(
+    const std::string& smallest_key, const std::string& largest_key,
+    size_t num_input_files, size_t num_input_files_at_output_level,
+    uint64_t num_input_records, size_t key_size, size_t value_size,
+    size_t num_output_files, uint64_t num_output_records,
+    double compression_ratio, uint64_t num_records_replaced,
+    bool is_manual = true) {
+  CompactionJobStats stats;
+  stats.Reset();
+
+  stats.num_input_records = num_input_records;
+  stats.num_input_files = num_input_files;
+  stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+  stats.num_output_records = num_output_records;
+  stats.num_output_files = num_output_files;
+
+  stats.total_input_bytes =
+      EstimatedFileSize(
+          num_input_records / num_input_files,
+          key_size, value_size, compression_ratio) * num_input_files;
+  stats.total_output_bytes =
+      EstimatedFileSize(
+          num_output_records / num_output_files,
+          key_size, value_size, compression_ratio) * num_output_files;
+  stats.total_input_raw_key_bytes =
+      num_input_records * (key_size + 8);
+  stats.total_input_raw_value_bytes =
+      num_input_records * value_size;
+
+  stats.is_manual_compaction = is_manual;
+
+  stats.num_records_replaced = num_records_replaced;
+
+  CopyPrefix(smallest_key,
+             CompactionJobStats::kMaxPrefixLength,
+             &stats.smallest_output_key_prefix);
+  CopyPrefix(largest_key,
+             CompactionJobStats::kMaxPrefixLength,
+             &stats.largest_output_key_prefix);
+
+  return stats;
+}
+
+CompressionType GetAnyCompression() {
+  if (Snappy_Supported()) {
+    return kSnappyCompression;
+  } else if (Zlib_Supported()) {
+    return kZlibCompression;
+  } else if (BZip2_Supported()) {
+    return kBZip2Compression;
+  } else if (LZ4_Supported()) {
+    return kLZ4Compression;
+  } else if (XPRESS_Supported()) {
+    return kXpressCompression;
+  }
+
+  return kNoCompression;
+}
+
+}  // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+  Random rnd(301);
+  const int kBufSize = 100;
+  char buf[kBufSize];
+  uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+  int num_keys_per_L0_file = 100;
+  const int kTestScale = 8;
+  const int kKeySize = 10;
+  const int kValueSize = 1000;
+  const double kCompressionRatio = 0.5;
+  double compression_ratio = 1.0;
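+  // compression_ratio stays 1.0 while the test runs with kNoCompression;
+  // the second pass of the loop below switches to a real codec (when one
+  // is compiled in) and lowers it to kCompressionRatio.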
+ uint64_t key_interval = key_base / num_keys_per_L0_file; + + // Whenever a compaction completes, this listener will try to + // verify whether the returned CompactionJobStats matches + // what we expect. The expected CompactionJobStats is added + // via AddExpectedStats(). + auto* stats_checker = new CompactionJobStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + // just enough setting to hold off auto-compaction. + options.level0_file_num_compaction_trigger = kTestScale + 1; + options.num_levels = 3; + options.compression = kNoCompression; + options.max_subcompactions = max_subcompactions_; + options.bytes_per_sync = 512 * 1024; + + options.report_bg_io_stats = true; + for (int test = 0; test < 2; ++test) { + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // 1st Phase: generate "num_L0_files" L0 files. + int num_L0_files = 0; + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + snprintf(buf, kBufSize, "%d", ++num_L0_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + ASSERT_EQ(ToString(num_L0_files), FilesPerLevel(1)); + + // 2nd Phase: perform L0 -> L1 compaction. + int L0_compaction_count = 6; + int count = 1; + std::string smallest_key; + std::string largest_key; + for (uint64_t start_key = key_base; + start_key <= key_base * L0_compaction_count; + start_key += key_base, count++) { + smallest_key = Key(start_key, 10); + largest_key = Key(start_key + key_base - key_interval, 10); + stats_checker->AddExpectedStats( + NewManualCompactionJobStats( + smallest_key, largest_key, + 1, 0, num_keys_per_L0_file, + kKeySize, kValueSize, + 1, num_keys_per_L0_file, + compression_ratio, 0)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + TEST_Compact(0, 1, smallest_key, largest_key); + snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // compact two files into one in the last L0 -> L1 compaction + int num_remaining_L0 = num_L0_files - L0_compaction_count; + smallest_key = Key(key_base * (L0_compaction_count + 1), 10); + largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10); + stats_checker->AddExpectedStats( + NewManualCompactionJobStats( + smallest_key, largest_key, + num_remaining_L0, + 0, num_keys_per_L0_file * num_remaining_L0, + kKeySize, kValueSize, + 1, num_keys_per_L0_file * num_remaining_L0, + compression_ratio, 0)); + ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); + TEST_Compact(0, 1, smallest_key, largest_key); + + int num_L1_files = num_L0_files - num_remaining_L0 + 1; + num_L0_files = 0; + snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + + // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys) + int sparseness = 2; + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale; + start_key += key_base * sparseness) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base * sparseness - 1, + kKeySize, kValueSize, + key_base * sparseness / num_keys_per_L0_file, + compression_ratio, 1); + snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files); + ASSERT_EQ(std::string(buf), FilesPerLevel(1)); + } + + // 4th Phase: perform L0 -> L1 compaction again, expect higher write amp + // 
When subcompactions are enabled, the number of output files increases
+    // by 1 because multiple threads are consuming the input and generating
+    // output files without coordinating to see if the output could fit into
+    // a smaller number of files like it does when it runs sequentially
+    int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+    for (uint64_t start_key = key_base;
+         num_L0_files > 1;
+         start_key += key_base * sparseness) {
+      smallest_key = Key(start_key, 10);
+      largest_key =
+          Key(start_key + key_base * sparseness - key_interval, 10);
+      stats_checker->AddExpectedStats(
+          NewManualCompactionJobStats(
+              smallest_key, largest_key,
+              3, 2, num_keys_per_L0_file * 3,
+              kKeySize, kValueSize,
+              num_output_files,
+              num_keys_per_L0_file * 2,  // 1/3 of the data will be updated.
+              compression_ratio,
+              num_keys_per_L0_file));
+      ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+      Compact(1, smallest_key, largest_key);
+      if (options.max_subcompactions == 1) {
+        --num_L1_files;
+      }
+      snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // 5th Phase: Do a full compaction, which involves two subcompactions.
+    // Here we expect to have 1 L0 file and 4 L1 files.
+    // In the first subcompaction, we expect L0 compaction.
+    smallest_key = Key(key_base, 10);
+    largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+    stats_checker->AddExpectedStats(
+        NewManualCompactionJobStats(
+            Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key,
+            2, 1, num_keys_per_L0_file * 3,
+            kKeySize, kValueSize,
+            1, num_keys_per_L0_file * 2,
+            compression_ratio,
+            num_keys_per_L0_file));
+    ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+    Compact(1, smallest_key, largest_key);
+
+    num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+    char L1_buf[4];
+    snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+    std::string L1_files(L1_buf);
+    ASSERT_EQ(L1_files, FilesPerLevel(1));
+    options.compression = GetAnyCompression();
+    if (options.compression == kNoCompression) {
+      break;
+    }
+    stats_checker->EnableCompression(true);
+    compression_ratio = kCompressionRatio;
+
+    for (int i = 0; i < 5; i++) {
+      ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+                    Slice(RandomString(&rnd, 512 * 1024, 1))));
+    }
+
+    ASSERT_OK(Flush(1));
+    reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+
+    stats_checker->set_verify_next_comp_io_stats(true);
+    std::atomic<bool> first_prepare_write(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+          if (first_prepare_write.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_prepare_write.store(false);
+          }
+        });
+
+    std::atomic<bool> first_flush(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+          if (first_flush.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_flush.store(false);
+          }
+        });
+
+    std::atomic<bool> first_sync(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+          if (first_sync.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_sync.store(false);
+          }
+        });
+
+    std::atomic<bool> first_range_sync(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+          if (first_range_sync.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_range_sync.store(false);
+          }
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Compact(1, smallest_key, largest_key);
+
+    ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+    ASSERT_TRUE(!first_prepare_write.load());
+    ASSERT_TRUE(!first_flush.load());
+    ASSERT_TRUE(!first_sync.load());
+    ASSERT_TRUE(!first_range_sync.load());
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+  Random rnd(301);
+  uint64_t key_base = 100000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+  int num_keys_per_L0_file = 20;
+  const int kTestScale = 8;  // make sure this is even
+  const int kKeySize = 10;
+  const int kValueSize = 100;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_L0_file;
+  uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+  uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+  const std::string smallest_key = Key(key_base - 10, kKeySize);
+  const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+  // Whenever a compaction completes, this listener will try to
+  // verify whether the returned CompactionJobStats matches
+  // what we expect.
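+  // options.listeners stores std::shared_ptr<EventListener>, so the
+  // emplace_back() below wraps this raw pointer in a shared_ptr that
+  // owns the checker for the lifetime of the DB.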
+ auto* stats_checker = new CompactionJobDeletionStatsChecker(); + Options options; + options.listeners.emplace_back(stats_checker); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = kTestScale+1; + options.num_levels = 3; + options.compression = kNoCompression; + options.max_bytes_for_level_multiplier = 2; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Stage 1: Generate several L0 files and then send them to L2 by + // using CompactRangeOptions and CompactRange(). These files will + // have a strict subset of the keys from the full key-range + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale / 2; + start_key += key_base) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + } + + CompactRangeOptions cr_options; + cr_options.change_level = true; + cr_options.target_level = 2; + db_->CompactRange(cr_options, handles_[1], nullptr, nullptr); + ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); + + // Stage 2: Generate files including keys from the entire key range + for (uint64_t start_key = key_base; + start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues( + &rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); + } + + // Send these L0 files to L1 + TEST_Compact(0, 1, smallest_key, largest_key); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + + // Add a new record and flush so now there is a L0 file + // with a value too (not just deletions from the next step) + ASSERT_OK(Put(1, Key(key_base-6, kKeySize), "test")); + ASSERT_OK(Flush(1)); + + // Stage 3: Generate L0 files with some deletions so now + // there are files with the same key range in L0, L1, and L2 + int deletion_interval = 3; + CompactionJobStats first_compaction_stats; + SelectivelyDeleteKeys(key_base, largest_key_num, + key_interval, deletion_interval, kKeySize, cutoff_key_num, + &first_compaction_stats, 1); + + stats_checker->AddExpectedStats(first_compaction_stats); + + // Stage 4: Trigger compaction and verify the stats + TEST_Compact(0, 1, smallest_key, largest_key); +} + +namespace { +int GetUniversalCompactionInputUnits(uint32_t num_flushes) { + uint32_t compaction_input_units; + for (compaction_input_units = 1; + num_flushes >= compaction_input_units; + compaction_input_units *= 2) { + if ((num_flushes & compaction_input_units) != 0) { + return compaction_input_units > 1 ? 
compaction_input_units : 0;
+    }
+  }
+  return 0;
+}
+}  // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+  Random rnd(301);
+  uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_table
+  int num_keys_per_table = 100;
+  const uint32_t kTestScale = 6;
+  const int kKeySize = 10;
+  const int kValueSize = 900;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_table;
+
+  auto* stats_checker = new CompactionJobStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = num_keys_per_table * 1000;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 1;
+  options.compaction_options_universal.max_size_amplification_percent = 1000;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Generates the expected CompactionJobStats for each compaction
+  for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as a unit.
+    //
+    // For example, if a newly flushed file is 100k, and a compaction has
+    // 4 input units, then this compaction inputs 400k.
+    uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+    if (num_input_units == 0) {
+      continue;
+    }
+    // The following statement determines the expected smallest key
+    // based on whether it is a full compaction.  A full compaction only
+    // happens when the number of flushes equals the number of compaction
+    // input runs.
+    uint64_t smallest_key =
+        (num_flushes == num_input_units) ?
+            key_base : key_base * (num_flushes - 1);
+
+    stats_checker->AddExpectedStats(
+        NewManualCompactionJobStats(
+            Key(smallest_key, 10),
+            Key(smallest_key + key_base * num_input_units - key_interval, 10),
+            num_input_units,
+            num_input_units > 2 ? num_input_units / 2 : 0,
+            num_keys_per_table * num_input_units,
+            kKeySize, kValueSize,
+            num_input_units,
+            num_keys_per_table * num_input_units,
+            1.0, 0, false));
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+  for (uint64_t start_key = key_base;
+       start_key <= key_base * kTestScale;
+       start_key += key_base) {
+    MakeTableWithKeyValues(
+        &rnd, start_key, start_key + key_base - 1,
+        kKeySize, kValueSize, key_interval,
+        compression_ratio, 1);
+    reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+                        ::testing::Values(1, 4));
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif  // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..e7b46ef97
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,1082 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob_index.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+    const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+  ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+  ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+  ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+  ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+  ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+  ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+  ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+  ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+  ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+  ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif  // !defined(IOS_CROSS_COMPILE)
+}
+
+}  // namespace
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public testing::Test {
+ public:
+  CompactionJobTest()
+      : env_(Env::Default()),
+        fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+        dbname_(test::PerThreadDBPath("compaction_job_test")),
+        db_options_(),
+        mutable_cf_options_(cf_options_),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_manager_(db_options_.db_write_buffer_size),
+        versions_(new VersionSet(dbname_, &db_options_, env_options_,
+                                 table_cache_.get(), &write_buffer_manager_,
+                                 &write_controller_,
+                                 /*block_cache_tracer=*/nullptr)),
+        shutting_down_(false),
+        preserve_deletes_seqnum_(0),
+        mock_table_factory_(new mock::MockTableFactory()),
+        error_handler_(nullptr, db_options_, &mutex_) {
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+    db_options_.env = env_;
+    db_options_.fs = fs_;
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+  }
+
+  std::string GenerateFileName(uint64_t file_number) {
+    FileMetaData meta;
+    std::vector<DbPath> db_paths;
+    db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+    meta.fd = FileDescriptor(file_number, 0, 0);
+    return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+  }
+
+  static std::string KeyStr(const std::string& user_key,
+                            const SequenceNumber seq_num, const ValueType t) {
+    return InternalKey(user_key, seq_num, t).Encode().ToString();
+  }
+
+  static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+                             uint64_t size) {
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                          kNoCompression);
+    return blob_index;
+  }
+
+  static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+                                uint64_t size, uint64_t expiration) {
+    std::string blob_index;
+    BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+                             size, kNoCompression);
+    return blob_index;
+  }
+
+  static std::string BlobStrInlinedTTL(const Slice& value,
+                                       uint64_t expiration) {
+    std::string blob_index;
+    BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+    return blob_index;
+  }
+
+  void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) {
+    assert(contents.size() > 0);
+
+    bool first_key = true;
+    std::string smallest, largest;
+    InternalKey smallest_key, largest_key;
+    SequenceNumber smallest_seqno = kMaxSequenceNumber;
+    SequenceNumber largest_seqno = 0;
+    uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+    for (auto kv : contents) {
+      ParsedInternalKey key;
+      std::string skey;
+      std::string value;
+      std::tie(skey, value) = kv;
+      bool parsed = ParseInternalKey(skey, &key);
+
+      smallest_seqno = std::min(smallest_seqno, key.sequence);
+      largest_seqno = std::max(largest_seqno, key.sequence);
+
+      if (first_key ||
+          cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+        smallest.assign(key.user_key.data(), key.user_key.size());
+        smallest_key.DecodeFrom(skey);
+      }
+      if (first_key ||
+          cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+        largest.assign(key.user_key.data(), key.user_key.size());
+        largest_key.DecodeFrom(skey);
+      }
+
+      first_key = false;
+
+      if (parsed && key.type == kTypeBlobIndex) {
+        BlobIndex blob_index;
+        const Status s = blob_index.DecodeFrom(value);
+        if (!s.ok()) {
+          continue;
+        }
+
+        if (blob_index.IsInlined() || blob_index.HasTTL() ||
+            blob_index.file_number() == kInvalidBlobFileNumber) {
+          continue;
+        }
+
+        if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+            oldest_blob_file_number > blob_index.file_number()) {
+          oldest_blob_file_number = blob_index.file_number();
+        }
+      }
+    }
+
+    uint64_t file_number = versions_->NewFileNumber();
+    EXPECT_OK(mock_table_factory_->CreateMockTable(
+        env_, GenerateFileName(file_number), std::move(contents)));
+
+    VersionEdit edit;
+    edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key,
+                 smallest_seqno, largest_seqno, false, oldest_blob_file_number,
+                 kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+                 kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+
+    mutex_.Lock();
+    versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                           mutable_cf_options_, &edit, &mutex_);
+    mutex_.Unlock();
+  }
+
+  void SetLastSequence(const SequenceNumber sequence_number) {
+    versions_->SetLastAllocatedSequence(sequence_number + 1);
+    versions_->SetLastPublishedSequence(sequence_number + 1);
+    versions_->SetLastSequence(sequence_number + 1);
+  }
+
+  // returns expected result after compaction
+  stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) {
+    auto expected_results = mock::MakeMockFile();
+    const int kKeysPerFile = 10000;
+    const int kCorruptKeysPerFile = 200;
+    const int kMatchingKeys = kKeysPerFile / 2;
+    SequenceNumber sequence_number = 0;
+
+    auto corrupt_id = [&](int id) {
+      return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+    };
+
+    for (int i = 0; i < 2; ++i) {
+      auto contents = mock::MakeMockFile();
+      for (int k = 0; k < kKeysPerFile; ++k) {
+        auto key = ToString(i * kMatchingKeys + k);
+        auto value = ToString(i * kKeysPerFile + k);
+        InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+        // This is how the key will look once it's written in the bottommost
+        // file
+        InternalKey bottommost_internal_key(
+            key, 0, kTypeValue);
+
+        if (corrupt_id(k)) {
+          test::CorruptKeyType(&internal_key);
+          test::CorruptKeyType(&bottommost_internal_key);
+        }
+        contents.insert({ internal_key.Encode().ToString(), value });
+        if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+          expected_results.insert(
+              { bottommost_internal_key.Encode().ToString(), value });
+        }
+      }
+
+      AddMockFile(contents);
+    }
+
+    SetLastSequence(sequence_number);
+
+    return expected_results;
+  }
+
+  void NewDB() {
+    DestroyDB(dbname_, Options());
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+    versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+                                   table_cache_.get(), &write_buffer_manager_,
+                                   &write_controller_,
+                                   /*block_cache_tracer=*/nullptr));
+    compaction_job_stats_.Reset();
+    SetIdentityFile(env_, dbname_);
+
+    VersionEdit new_db;
+    if (db_options_.write_dbid_to_manifest) {
+      DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+      std::string db_id;
+      impl->GetDbIdentityFromIdentityFile(&db_id);
+      new_db.SetDBId(db_id);
+    }
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    std::unique_ptr<WritableFile> file;
+    Status s = env_->NewWritableFile(
+        manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+    ASSERT_OK(s);
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_));
+    {
+      log::Writer log(std::move(file_writer), 0, false);
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
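+    // CURRENT is a small text file whose single line names the live
+    // MANIFEST (here "MANIFEST-000001"); VersionSet::Recover() and
+    // DB::Open() read it first to locate the version history.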
+    s = SetCurrentFile(env_, dbname_, 1, nullptr);
+
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    cf_options_.merge_operator = merge_op_;
+    cf_options_.compaction_filter = compaction_filter_.get();
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+    EXPECT_OK(versions_->Recover(column_families, false));
+    cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+  }
+
+  void RunCompaction(
+      const std::vector<std::vector<FileMetaData*>>& input_files,
+      const stl_wrappers::KVMap& expected_results,
+      const std::vector<SequenceNumber>& snapshots = {},
+      SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+      int output_level = 1, bool verify = true,
+      uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) {
+    auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+    size_t num_input_files = 0;
+    std::vector<CompactionInputFiles> compaction_input_files;
+    for (size_t level = 0; level < input_files.size(); level++) {
+      auto level_files = input_files[level];
+      CompactionInputFiles compaction_level;
+      compaction_level.level = static_cast<int>(level);
+      compaction_level.files.insert(compaction_level.files.end(),
+                                    level_files.begin(), level_files.end());
+      compaction_input_files.push_back(compaction_level);
+      num_input_files += level_files.size();
+    }
+
+    Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(),
+                          *cfd->GetLatestMutableCFOptions(),
+                          compaction_input_files, output_level, 1024 * 1024,
+                          10 * 1024 * 1024, 0, kNoCompression,
+                          cfd->ioptions()->compression_opts, 0, {}, true);
+    compaction.SetInputVersion(cfd->current());
+
+    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+    mutex_.Lock();
+    EventLogger event_logger(db_options_.info_log.get());
+    // TODO(yiwu) add a mock snapshot checker and add test for it.
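+    // Passing a null SnapshotChecker yields plain snapshot semantics: the
+    // compaction treats every entry as committed, which is what these tests
+    // assume (a mock checker would be needed to exercise WritePrepared-style
+    // visibility).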
+    SnapshotChecker* snapshot_checker = nullptr;
+    CompactionJob compaction_job(
+        0, &compaction, db_options_, env_options_, versions_.get(),
+        &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr,
+        nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+        earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
+        &event_logger, false, false, dbname_, &compaction_job_stats_,
+        Env::Priority::USER);
+    VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+    compaction_job.Prepare();
+    mutex_.Unlock();
+    Status s;
+    s = compaction_job.Run();
+    ASSERT_OK(s);
+    mutex_.Lock();
+    ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+    mutex_.Unlock();
+
+    if (verify) {
+      ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+      ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+      if (expected_results.empty()) {
+        ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+      } else {
+        ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
+        mock_table_factory_->AssertLatestFile(expected_results);
+
+        auto output_files =
+            cfd->current()->storage_info()->LevelFiles(output_level);
+        ASSERT_EQ(output_files.size(), 1);
+        ASSERT_EQ(output_files[0]->oldest_blob_file_number,
+                  expected_oldest_blob_file_number);
+      }
+    }
+  }
+
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string dbname_;
+  EnvOptions env_options_;
+  ImmutableDBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  MutableCFOptions mutable_cf_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  WriteBufferManager write_buffer_manager_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  SequenceNumber preserve_deletes_seqnum_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+  CompactionJobStats compaction_job_stats_;
+  ColumnFamilyData* cfd_;
+  std::unique_ptr<CompactionFilter> compaction_filter_;
+  std::shared_ptr<MergeOperator> merge_op_;
+  ErrorHandler error_handler_;
+};
+
+TEST_F(CompactionJobTest, Simple) {
+  NewDB();
+
+  auto expected_results = CreateTwoFiles(false);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto files = cfd->current()->storage_info()->LevelFiles(0);
+  ASSERT_EQ(2U, files.size());
+  RunCompaction({ files }, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleCorrupted) {
+  NewDB();
+
+  auto expected_results = CreateTwoFiles(true);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto files = cfd->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+  ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+                                   {KeyStr("c", 3U, kTypeValue), "val"}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+                                   {KeyStr("b", 1U, kTypeValue), "val"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+  SetLastSequence(4U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+  AddMockFile(file2);
+
+  auto expected_results = mock::MakeMockFile();
+
+  SetLastSequence(4U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  
RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, SimpleOverwrite) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 3U, kTypeValue), "val2"}, + {KeyStr("b", 4U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"}, + {KeyStr("b", 0U, kTypeValue), "val3"}}); + + SetLastSequence(4U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, SimpleNonLastLevel) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2, 1); + + auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}, + {KeyStr("b", 2U, kTypeValue), "val"}}); + AddMockFile(file3, 2); + + // Because level 1 is not the last level, the sequence numbers of a and b + // cannot be set to 0 + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"}, + {KeyStr("b", 6U, kTypeValue), "val3"}}); + + SetLastSequence(6U); + auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0); + auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1); + RunCompaction({lvl0_files, lvl1_files}, expected_results); +} + +TEST_F(CompactionJobTest, SimpleMerge) { + merge_op_ = MergeOperators::CreateStringAppendOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeValue), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, NonAssocMerge) { + merge_op_ = MergeOperators::CreateStringAppendTESTOperator(); + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeMerge), "5"}, + {KeyStr("a", 4U, kTypeMerge), "4"}, + {KeyStr("a", 3U, kTypeMerge), "3"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}}); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, + {KeyStr("b", 0U, kTypeValue), "1,2"}}); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +// Filters merge operands with value 10. 
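+// The filter sees each operand on its own, so a filtered operand simply
+// drops out of the final merge result: below, "a" becomes 5 + 3 == 8 once
+// the operand 10 is filtered away.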
+TEST_F(CompactionJobTest, MergeOperandFilter) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered + }); + AddMockFile(file2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)}, + {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}}); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, FilterSomeMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)}, + {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered + {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)}, + {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)}, + {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}}); + AddMockFile(file3, 2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)}, + {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)} + // b does not appear because the operands are filtered + }); + + SetLastSequence(5U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +// Test where all operands/merge results are filtered out. 
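+// With every operand filtered there is nothing left to output for these
+// keys, so the compaction is expected to produce an empty file map (see
+// the empty_map passed to RunCompaction() below).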
+TEST_F(CompactionJobTest, FilterAllMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + compaction_filter_.reset(new test::FilterNumber(10U)); + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file2); + + auto file3 = + mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)}, + {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}}); + AddMockFile(file3, 2); + + SetLastSequence(11U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + + stl_wrappers::KVMap empty_map; + RunCompaction({files}, empty_map); +} + +TEST_F(CompactionJobTest, SimpleSingleDelete) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("a", 5U, kTypeDeletion), ""}, + {KeyStr("b", 6U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"}, + {KeyStr("b", 4U, kTypeValue), "val"}}); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("a", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}}); + + SetLastSequence(6U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTest, SingleDeleteSnapshots) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("d", 9U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 9U, kTypeSingleDeletion), ""}, + {KeyStr("k", 12U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("l", 3U, kTypeSingleDeletion), ""}, + {KeyStr("l", 2U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("0", 2U, kTypeSingleDeletion), ""}, + {KeyStr("a", 11U, kTypeValue), "val1"}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 21U, kTypeValue), "val3"}, + {KeyStr("d", 8U, kTypeValue), "val4"}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("h", 2U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("A", 1U, kTypeValue), "val"}, + {KeyStr("e", 1U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("a", 12U, kTypeSingleDeletion), 
""}, + {KeyStr("a", 11U, kTypeValue), ""}, + {KeyStr("b", 21U, kTypeSingleDeletion), ""}, + {KeyStr("b", 11U, kTypeValue), "val2"}, + {KeyStr("c", 22U, kTypeSingleDeletion), ""}, + {KeyStr("c", 21U, kTypeValue), ""}, + {KeyStr("e", 2U, kTypeSingleDeletion), ""}, + {KeyStr("f", 21U, kTypeSingleDeletion), ""}, + {KeyStr("f", 1U, kTypeValue), "val1"}, + {KeyStr("g", 11U, kTypeSingleDeletion), ""}, + {KeyStr("j", 11U, kTypeSingleDeletion), ""}, + {KeyStr("k", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 12U, kTypeValue), "val1"}, + {KeyStr("m", 11U, kTypeSingleDeletion), ""}, + {KeyStr("m", 8U, kTypeValue), "val2"}, + }); + + SetLastSequence(22U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {10U, 20U}, 10U); +} + +TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) { + NewDB(); + + // Test multiple snapshots where the earliest snapshot is not a + // write-conflic-snapshot. + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), "val"}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), "val"}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), "val"}, + {KeyStr("G", 24U, kTypeSingleDeletion), ""}, + {KeyStr("G", 23U, kTypeValue), "val2"}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("H", 24U, kTypeSingleDeletion), ""}, + {KeyStr("H", 23U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), "val2"}, + {KeyStr("I", 33U, kTypeSingleDeletion), ""}, + {KeyStr("I", 32U, kTypeValue), "val3"}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("J", 34U, kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), "val2"}, + {KeyStr("C", 14U, kTypeSingleDeletion), ""}, + {KeyStr("C", 13U, kTypeValue), "val"}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val3"}, + {KeyStr("H", 14U, kTypeSingleDeletion), ""}, + {KeyStr("H", 13U, kTypeValue), "val2"}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val5"}, + {KeyStr("J", 15U, kTypeValue), "val3"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("A", 24U, kTypeSingleDeletion), ""}, + {KeyStr("A", 23U, kTypeValue), ""}, + {KeyStr("B", 24U, kTypeSingleDeletion), ""}, + {KeyStr("B", 23U, kTypeValue), ""}, + {KeyStr("D", 24U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("G", 32U, kTypeSingleDeletion), ""}, + {KeyStr("G", 31U, kTypeValue), ""}, + {KeyStr("H", 31U, kTypeValue), "val"}, + {KeyStr("I", 35U, kTypeSingleDeletion), ""}, + {KeyStr("I", 34U, kTypeValue), ""}, + {KeyStr("I", 31U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val4"}, + {KeyStr("J", 34U, kTypeValue), "val"}, + {KeyStr("J", 33U, kTypeSingleDeletion), ""}, + {KeyStr("J", 25U, kTypeValue), "val2"}, + {KeyStr("J", 24U, kTypeSingleDeletion), ""}, + {KeyStr("J", 15U, kTypeValue), "val3"}, 
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + }); + + SetLastSequence(24U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {10U, 20U, 30U}, 20U); +} + +TEST_F(CompactionJobTest, SingleDeleteZeroSeq) { + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 10U, kTypeSingleDeletion), ""}, + {KeyStr("dummy", 5U, kTypeValue), "val2"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 0U, kTypeValue), "val"}, + }); + AddMockFile(file2); + + auto expected_results = mock::MakeMockFile({ + {KeyStr("dummy", 0U, kTypeValue), "val2"}, + }); + + SetLastSequence(22U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {}); +} + +TEST_F(CompactionJobTest, MultiSingleDelete) { + // Tests three scenarios involving multiple single delete/put pairs: + // + // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel + // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot + // C: SDel Put SDel Snapshot Put -> Snapshot Put + // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel + // E: Put SDel Snapshot Put SDel -> Snapshot SDel + // F: Put SDel Put Sdel Snapshot -> removed + // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel + // H: (Put) Put SDel Put Sdel Snapshot -> Removed + // I: (Put) Snapshot Put SDel Put SDel -> SDel + // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put + // -> Snapshot Put + // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel + // -> Snapshot Put Snapshot SDel + // L: SDel Put Del Put SDel Snapshot Del Put Del SDel Put SDel + // -> Snapshot SDel + // M: (Put) SDel Put Del Put SDel Snapshot Put Del SDel Put SDel Del + // -> SDel Snapshot Del + NewDB(); + + auto file1 = mock::MakeMockFile({ + {KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), "val5"}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), "val2"}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), "val4"}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 14U, kTypeSingleDeletion), ""}, + {KeyStr("G", 13U, kTypeValue), "val"}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), "val"}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("J", 14U, kTypeSingleDeletion), ""}, + {KeyStr("J", 13U, kTypeSingleDeletion), ""}, + {KeyStr("J", 12U, kTypeValue), "val"}, + {KeyStr("J", 11U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), "val1"}, + {KeyStr("K", 14U, kTypeSingleDeletion), ""}, + {KeyStr("K", 13U, kTypeSingleDeletion), ""}, + {KeyStr("K", 12U, kTypeValue), "val2"}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), "val"}, + {KeyStr("L", 14U, kTypeSingleDeletion), ""}, + {KeyStr("L", 13U, kTypeDeletion), ""}, + {KeyStr("L", 12U, kTypeValue), "val"}, + {KeyStr("L", 11U, kTypeDeletion), ""}, + {KeyStr("M", 16U, kTypeDeletion), ""}, + {KeyStr("M", 15U, kTypeSingleDeletion), ""}, + {KeyStr("M", 14U, kTypeValue), "val"}, + {KeyStr("M", 13U, kTypeSingleDeletion), ""}, + {KeyStr("M", 12U, kTypeDeletion), ""}, + {KeyStr("M", 11U, kTypeValue), "val"}, + }); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile({ + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 12U, kTypeSingleDeletion), 
""}, + {KeyStr("B", 11U, kTypeValue), "val2"}, + {KeyStr("C", 10U, kTypeSingleDeletion), ""}, + {KeyStr("C", 9U, kTypeValue), "val6"}, + {KeyStr("C", 8U, kTypeSingleDeletion), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), "val"}, + {KeyStr("E", 5U, kTypeSingleDeletion), ""}, + {KeyStr("E", 4U, kTypeValue), "val"}, + {KeyStr("F", 6U, kTypeSingleDeletion), ""}, + {KeyStr("F", 5U, kTypeValue), "val"}, + {KeyStr("F", 4U, kTypeSingleDeletion), ""}, + {KeyStr("F", 3U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("H", 6U, kTypeSingleDeletion), ""}, + {KeyStr("H", 5U, kTypeValue), "val"}, + {KeyStr("H", 4U, kTypeSingleDeletion), ""}, + {KeyStr("H", 3U, kTypeValue), "val"}, + {KeyStr("I", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 11U, kTypeValue), "val"}, + {KeyStr("J", 6U, kTypeSingleDeletion), ""}, + {KeyStr("J", 5U, kTypeSingleDeletion), ""}, + {KeyStr("J", 4U, kTypeValue), "val"}, + {KeyStr("J", 3U, kTypeSingleDeletion), ""}, + {KeyStr("J", 2U, kTypeValue), "val"}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("K", 7U, kTypeValue), "val4"}, + {KeyStr("K", 6U, kTypeSingleDeletion), ""}, + {KeyStr("K", 5U, kTypeValue), "val5"}, + {KeyStr("K", 2U, kTypeSingleDeletion), ""}, + {KeyStr("K", 1U, kTypeSingleDeletion), ""}, + {KeyStr("L", 5U, kTypeSingleDeletion), ""}, + {KeyStr("L", 4U, kTypeValue), "val"}, + {KeyStr("L", 3U, kTypeDeletion), ""}, + {KeyStr("L", 2U, kTypeValue), "val"}, + {KeyStr("L", 1U, kTypeSingleDeletion), ""}, + {KeyStr("M", 10U, kTypeSingleDeletion), ""}, + {KeyStr("M", 7U, kTypeValue), "val"}, + {KeyStr("M", 5U, kTypeDeletion), ""}, + {KeyStr("M", 4U, kTypeValue), "val"}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}, + }); + AddMockFile(file2); + + auto file3 = mock::MakeMockFile({ + {KeyStr("D", 1U, kTypeValue), "val"}, + {KeyStr("H", 1U, kTypeValue), "val"}, + {KeyStr("I", 2U, kTypeValue), "val"}, + }); + AddMockFile(file3, 2); + + auto file4 = mock::MakeMockFile({ + {KeyStr("M", 1U, kTypeValue), "val"}, + }); + AddMockFile(file4, 2); + + auto expected_results = + mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""}, + {KeyStr("A", 13U, kTypeValue), ""}, + {KeyStr("A", 12U, kTypeSingleDeletion), ""}, + {KeyStr("A", 10U, kTypeValue), "val"}, + {KeyStr("B", 14U, kTypeSingleDeletion), ""}, + {KeyStr("B", 13U, kTypeValue), ""}, + {KeyStr("C", 14U, kTypeValue), "val3"}, + {KeyStr("D", 12U, kTypeSingleDeletion), ""}, + {KeyStr("D", 11U, kTypeValue), ""}, + {KeyStr("D", 10U, kTypeSingleDeletion), ""}, + {KeyStr("E", 12U, kTypeSingleDeletion), ""}, + {KeyStr("E", 11U, kTypeValue), ""}, + {KeyStr("G", 15U, kTypeValue), "val"}, + {KeyStr("G", 12U, kTypeSingleDeletion), ""}, + {KeyStr("I", 14U, kTypeSingleDeletion), ""}, + {KeyStr("I", 13U, kTypeValue), ""}, + {KeyStr("J", 15U, kTypeValue), "val"}, + {KeyStr("K", 16U, kTypeSingleDeletion), ""}, + {KeyStr("K", 15U, kTypeValue), ""}, + {KeyStr("K", 11U, kTypeSingleDeletion), ""}, + {KeyStr("K", 8U, kTypeValue), "val3"}, + {KeyStr("L", 16U, kTypeSingleDeletion), ""}, + {KeyStr("L", 15U, kTypeValue), ""}, + {KeyStr("M", 16U, kTypeDeletion), ""}, + {KeyStr("M", 3U, kTypeSingleDeletion), ""}}); + + SetLastSequence(22U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, {10U}, 10U); +} + +// This test documents the behavior where a corrupt key follows a deletion or a +// single deletion and the (single) deletion gets removed while the corrupt key +// 
+
+// This test documents the behavior where a corrupt key follows a deletion or
+// a single deletion and the (single) deletion gets removed while the corrupt
+// key gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, CorruptionAfterDeletion) {
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+                          {test::KeyStr("a", 5U, kTypeDeletion), ""},
+                          {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+                          {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+                          {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+                          {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+                          {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+                          {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+  SetLastSequence(6U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+  NewDB();
+
+  // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+  // of identifying the oldest referenced blob file. Similarly, blob6 will be
+  // ignored because it has TTL and hence refers to a TTL blob file.
+  const stl_wrappers::KVMap::value_type blob1(
+      KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+  const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+                                              BlobStr(59, 123456, 999));
+  const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+                                              BlobStr(138, 1000, 1 << 8));
+  auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+  AddMockFile(file1);
+
+  const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+                                              BlobStr(199, 3 << 10, 1 << 20));
+  const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+                                              BlobStr(19, 6789, 333));
+  const stl_wrappers::KVMap::value_type blob6(
+      KeyStr("f", 6U, kTypeBlobIndex),
+      BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+  auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+  AddMockFile(file2);
+
+  const stl_wrappers::KVMap::value_type expected_blob1(
+      KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+  const stl_wrappers::KVMap::value_type expected_blob2(
+      KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+  const stl_wrappers::KVMap::value_type expected_blob3(
+      KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+  const stl_wrappers::KVMap::value_type expected_blob4(
+      KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+  const stl_wrappers::KVMap::value_type expected_blob5(
+      KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+  const stl_wrappers::KVMap::value_type expected_blob6(
+      KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+  auto expected_results =
+      mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+                          expected_blob4, expected_blob5, expected_blob6});
+
+  SetLastSequence(6U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results, std::vector<SequenceNumber>(),
+                kMaxSequenceNumber, /* output_level */ 1, /* verify */ true,
+                /* expected_oldest_blob_file_number */ 19);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
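As an illustrative aside between the two files of this patch (not part of the patch itself): the MultiSingleDelete cases above encode, at the mock-file level, how Put/SingleDelete pairs interact with snapshots. A minimal sketch of the same semantics through the public RocksDB API; the database path and option values here are arbitrary.

    // Illustrative only. Scenario A above: Put, snapshot, SingleDelete.
    // The snapshot pins the Put's sequence number, so compaction must keep
    // both the Put and the SingleDelete until the snapshot is released.
    #include <cassert>
    #include <string>

    #include "rocksdb/db.h"

    int main() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/single_delete_demo", &db);
      assert(s.ok());

      s = db->Put(rocksdb::WriteOptions(), "A", "val");
      assert(s.ok());
      const rocksdb::Snapshot* snap = db->GetSnapshot();
      s = db->SingleDelete(rocksdb::WriteOptions(), "A");
      assert(s.ok());

      // Read through the snapshot: the old value is still visible.
      rocksdb::ReadOptions read_options;
      read_options.snapshot = snap;
      std::string value;
      s = db->Get(read_options, "A", &value);
      assert(s.ok() && value == "val");

      // Without the snapshot, the key is gone.
      s = db->Get(rocksdb::ReadOptions(), "A", &value);
      assert(s.IsNotFound());

      db->ReleaseSnapshot(snap);
      delete db;
      return 0;
    }

Releasing the snapshot is what eventually lets compaction drop both the Put and the SingleDelete, which is exactly the sequence-number reasoning the expected_results above encode.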
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..4355d4b91
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1131 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->compensated_file_size;
+  }
+  return sum;
+}
+}  // anonymous namespace
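Aside (illustrative, not part of the patch): FindIntraL0Compaction below pulls in L0 files while the average bytes compacted per deleted file keeps shrinking. A self-contained toy version of that greedy scan, with invented file sizes and none of the real being_compacted or min-file checks.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical L0 file sizes, newest first, in bytes.
      const std::vector<uint64_t> file_sizes = {4096, 4096, 8192, 65536};
      const uint64_t max_compaction_bytes = 1 << 20;

      uint64_t compact_bytes = file_sizes[0];
      uint64_t best_per_del_file = UINT64_MAX;
      size_t limit = 1;
      for (; limit < file_sizes.size(); ++limit) {
        compact_bytes += file_sizes[limit];
        // Average bytes rewritten per file deleted if we stop after `limit`.
        const uint64_t per_del_file = compact_bytes / limit;
        if (per_del_file > best_per_del_file ||
            compact_bytes > max_compaction_bytes) {
          break;  // Work per deleted file started growing, or we hit the cap.
        }
        best_per_del_file = per_del_file;
      }
      std::printf("would compact files [0, %zu)\n", limit);
      return 0;
    }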
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+                           size_t min_files_to_compact,
+                           uint64_t max_compact_bytes_per_del_file,
+                           uint64_t max_compaction_bytes,
+                           CompactionInputFiles* comp_inputs,
+                           SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file while there is at least one unflushed
+  // memtable whose seqno range overlaps with the sst.
+  TEST_SYNC_POINT("FindIntraL0Compaction");
+  size_t start = 0;
+  for (; start < level_files.size(); start++) {
+    if (level_files[start]->being_compacted) {
+      return false;
+    }
+    // If there is no data in a memtable, the earliest sequence number would
+    // be the largest sequence number in the last memtable.
+    // All files are sorted in descending order by largest_seqno, so we only
+    // need to check the first one.
+    if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+      break;
+    }
+  }
+  if (start >= level_files.size()) {
+    return false;
+  }
+  size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+  uint64_t compensated_compact_bytes =
+      level_files[start]->compensated_file_size;
+  size_t compact_bytes_per_del_file = port::kMaxSizet;
+  // Compaction range will be [start, limit).
+  size_t limit;
+  // Pull in files until the amount of compaction work per deleted file begins
+  // increasing or the maximum total compaction size is reached.
+  size_t new_compact_bytes_per_del_file = 0;
+  for (limit = start + 1; limit < level_files.size(); ++limit) {
+    compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+    compensated_compact_bytes += level_files[limit]->compensated_file_size;
+    new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+    if (level_files[limit]->being_compacted ||
+        new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+        compensated_compact_bytes > max_compaction_bytes) {
+      break;
+    }
+    compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+  }
+
+  if ((limit - start) >= min_files_to_compact &&
+      compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+    assert(comp_inputs != nullptr);
+    comp_inputs->level = 0;
+    for (size_t i = start; i < limit; ++i) {
+      comp_inputs->files.push_back(level_files[i]);
+    }
+    return true;
+  }
+  return false;
+}
+
+// Determine the compression type based on user options, the level of the
+// output file, and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+                                   const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level, int base_level,
+                                   const bool enable_compression) {
+  if (!enable_compression) {
+    // disable compression
+    return kNoCompression;
+  }
+
+  // If bottommost_compression is set and we are compacting to the
+  // bottommost level then we should use it.
+  if (ioptions.bottommost_compression != kDisableCompressionOption &&
+      level >= (vstorage->num_non_empty_levels() - 1)) {
+    return ioptions.bottommost_compression;
+  }
+  // If the user has specified a different compression level for each level,
+  // then pick the compression for that level.
+  if (!ioptions.compression_per_level.empty()) {
+    assert(level == 0 || level >= base_level);
+    int idx = (level == 0) ? 0 : level - base_level + 1;
+
+    const int n = static_cast<int>(ioptions.compression_per_level.size()) - 1;
+    // It is possible for the index to be -1; in that case, we use level
+    // 0's compression. This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to. Likewise, if the index is beyond the end of the
+    // specified compression levels, use the last value.
+    return ioptions.compression_per_level[std::max(0, std::min(idx, n))];
+  } else {
+    return mutable_cf_options.compression;
+  }
+}
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+                                         const VersionStorageInfo* vstorage,
+                                         int level,
+                                         const bool enable_compression) {
+  if (!enable_compression) {
+    return ioptions.compression_opts;
+  }
+  // If bottommost_compression is set and we are compacting to the
+  // bottommost level then we should use the compression options
+  // specified for bottommost_compression.
+  if (ioptions.bottommost_compression != kDisableCompressionOption &&
+      level >= (vstorage->num_non_empty_levels() - 1) &&
+      ioptions.bottommost_compression_opts.enabled) {
+    return ioptions.bottommost_compression_opts;
+  }
+  return ioptions.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions,
+                                   const InternalKeyComparator* icmp)
+    : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
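Aside (illustrative, not part of the patch): the clamped-index arithmetic GetCompressionType() applies to compression_per_level, reduced to a toy enum. FakeCompression and all values below are invented stand-ins, not RocksDB types.

    #include <algorithm>
    #include <cassert>
    #include <vector>

    enum FakeCompression { kNone, kSnappy, kZSTD };

    FakeCompression PickPerLevel(const std::vector<FakeCompression>& per_level,
                                 int level, int base_level) {
      assert(!per_level.empty());
      const int idx = (level == 0) ? 0 : level - base_level + 1;
      const int n = static_cast<int>(per_level.size()) - 1;
      // Clamp: a level below base_level maps to entry 0, and a level past the
      // end of the list reuses the last entry.
      return per_level[std::max(0, std::min(idx, n))];
    }

    int main() {
      const std::vector<FakeCompression> per_level = {kNone, kSnappy, kZSTD};
      assert(PickPerLevel(per_level, /*level=*/0, /*base_level=*/1) == kNone);
      assert(PickPerLevel(per_level, /*level=*/1, /*base_level=*/1) == kSnappy);
      assert(PickPerLevel(per_level, /*level=*/6, /*base_level=*/1) == kZSTD);
      return 0;
    }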
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+  UnregisterCompaction(c);
+  if (!status.ok()) {
+    c->ResetNextCompactionIndex();
+  }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+                                InternalKey* smallest,
+                                InternalKey* largest) const {
+  const int level = inputs.level;
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+
+  if (level == 0) {
+    for (size_t i = 0; i < inputs.size(); i++) {
+      FileMetaData* f = inputs[i];
+      if (i == 0) {
+        *smallest = f->smallest;
+        *largest = f->largest;
+      } else {
+        if (icmp_->Compare(f->smallest, *smallest) < 0) {
+          *smallest = f->smallest;
+        }
+        if (icmp_->Compare(f->largest, *largest) > 0) {
+          *largest = f->largest;
+        }
+      }
+    }
+  } else {
+    *smallest = inputs[0]->smallest;
+    *largest = inputs[inputs.size() - 1]->largest;
+  }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+                                const CompactionInputFiles& inputs2,
+                                InternalKey* smallest,
+                                InternalKey* largest) const {
+  assert(!inputs1.empty() || !inputs2.empty());
+  if (inputs1.empty()) {
+    GetRange(inputs2, smallest, largest);
+  } else if (inputs2.empty()) {
+    GetRange(inputs1, smallest, largest);
+  } else {
+    InternalKey smallest1, smallest2, largest1, largest2;
+    GetRange(inputs1, &smallest1, &largest1);
+    GetRange(inputs2, &smallest2, &largest2);
+    *smallest =
+        icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+    *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+  }
+}
+
+void CompactionPicker::GetRange(
+    const std::vector<CompactionInputFiles>& inputs, InternalKey* smallest,
+    InternalKey* largest) const {
+  InternalKey current_smallest;
+  InternalKey current_largest;
+  bool initialized = false;
+  for (const auto& in : inputs) {
+    if (in.empty()) {
+      continue;
+    }
+    GetRange(in, &current_smallest, &current_largest);
+    if (!initialized) {
+      *smallest = current_smallest;
+      *largest = current_largest;
+      initialized = true;
+    } else {
+      if (icmp_->Compare(current_smallest, *smallest) < 0) {
+        *smallest = current_smallest;
+      }
+      if (icmp_->Compare(current_largest, *largest) > 0) {
+        *largest = current_largest;
+      }
+    }
+  }
+  assert(initialized);
+}
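Aside (illustrative, not part of the patch): why the "clean cut" expansion performed by the next function matters. With invented file boundaries, one user key can straddle two adjacent files in a key-sorted level (under different sequence numbers); compacting only one of them could bury a newer version of that key below an older one.

    #include <cstdio>
    #include <string>
    #include <vector>

    struct FakeFile {
      std::string smallest_user_key;
      std::string largest_user_key;
    };

    int main() {
      // "b" spans files 0 and 1, so both must be compacted together.
      const std::vector<FakeFile> level = {{"a", "b"}, {"b", "d"}, {"e", "g"}};
      size_t begin = 0, end = 1;
      // Expand while a neighbor shares a boundary user key with the window.
      while (end < level.size() &&
             level[end].smallest_user_key == level[end - 1].largest_user_key) {
        ++end;
      }
      std::printf("clean-cut input set: files [%zu, %zu)\n", begin, end);
      return 0;
    }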
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+                                              VersionStorageInfo* vstorage,
+                                              CompactionInputFiles* inputs,
+                                              InternalKey** next_smallest) {
+  // An empty input set would not be a valid compaction.
+  assert(!inputs->empty());
+
+  const int level = inputs->level;
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (level == 0) {
+    return true;
+  }
+
+  InternalKey smallest, largest;
+
+  // Keep expanding inputs until we are sure that there is a "clean cut"
+  // boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  do {
+    old_size = inputs->size();
+    GetRange(*inputs, &smallest, &largest);
+    inputs->clear();
+    vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+                                   hint_index, &hint_index, true,
+                                   next_smallest);
+  } while (inputs->size() > old_size);
+
+  // We started off with inputs non-empty and the previous loop only grew
+  // inputs. Thus, inputs should be non-empty here.
+  assert(!inputs->empty());
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  if (AreFilesInCompaction(inputs->files)) {
+    return false;
+  }
+  return true;
+}
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int level) const {
+  const Comparator* ucmp = icmp_->user_comparator();
+  for (Compaction* c : compactions_in_progress_) {
+    if (c->output_level() == level &&
+        ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) <= 0 &&
+        ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) >= 0) {
+      // Overlap
+      return true;
+    }
+  }
+  // Did not overlap with any running compaction in level `level`
+  return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+    const std::vector<CompactionInputFiles>& inputs, int level) const {
+  bool is_empty = true;
+  for (auto& in : inputs) {
+    if (!in.empty()) {
+      is_empty = false;
+      break;
+    }
+  }
+  if (is_empty) {
+    // No files in inputs
+    return false;
+  }
+
+  InternalKey smallest, largest;
+  GetRange(inputs, &smallest, &largest);
+  return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+                                    level);
+}
+
+// Returns true if any one of the specified files is being compacted.
+bool CompactionPicker::AreFilesInCompaction(
+    const std::vector<FileMetaData*>& files) {
+  for (size_t i = 0; i < files.size(); i++) {
+    if (files[i]->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+    const CompactionOptions& compact_options,
+    const std::vector<CompactionInputFiles>& input_files, int output_level,
+    VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+    uint32_t output_path_id) {
+  assert(input_files.size());
+  // This compaction output should not overlap with a running compaction as
+  // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+  // shouldn't have been released since.
+  assert(!FilesRangeOverlapWithCompaction(input_files, output_level));
+
+  CompressionType compression_type;
+  if (compact_options.compression == kDisableCompressionOption) {
+    int base_level;
+    if (ioptions_.compaction_style == kCompactionStyleLevel) {
+      base_level = vstorage->base_level();
+    } else {
+      base_level = 1;
+    }
+    compression_type =
+        GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+                           output_level, base_level);
+  } else {
+    // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+    // without configurable `CompressionOptions`, which is inconsistent.
+    compression_type = compact_options.compression;
+  }
+  auto c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, input_files, output_level,
+      compact_options.output_file_size_limit,
+      mutable_cf_options.max_compaction_bytes, output_path_id,
+      compression_type,
+      GetCompressionOptions(ioptions_, vstorage, output_level),
+      compact_options.max_subcompactions,
+      /* grandparents */ {}, true);
+  RegisterCompaction(c);
+  return c;
+}
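Aside (illustrative, not part of the patch): the next function still rescans every level per lookup, as its TODO notes. One hypothetical shape a lazily built file-number index could take; FakeMeta and LazyFileIndex are invented stand-ins, not RocksDB types.

    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    struct FakeMeta {
      uint64_t file_number;
    };

    class LazyFileIndex {
     public:
      explicit LazyFileIndex(const std::vector<std::vector<FakeMeta>>& levels)
          : levels_(levels) {}

      // Builds the map on first use, then answers lookups in O(1).
      const FakeMeta* Find(uint64_t file_number) {
        if (index_.empty()) {
          for (const auto& level : levels_) {
            for (const auto& f : level) {
              index_.emplace(f.file_number, &f);
            }
          }
        }
        auto it = index_.find(file_number);
        return it == index_.end() ? nullptr : it->second;
      }

     private:
      const std::vector<std::vector<FakeMeta>>& levels_;
      std::unordered_map<uint64_t, const FakeMeta*> index_;
    };

    int main() {
      std::vector<std::vector<FakeMeta>> levels = {{{7}, {9}}, {{12}}};
      LazyFileIndex idx(levels);
      return idx.Find(12) != nullptr ? 0 : 1;
    }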
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+    std::vector<CompactionInputFiles>* input_files,
+    std::unordered_set<uint64_t>* input_set,
+    const VersionStorageInfo* vstorage,
+    const CompactionOptions& /*compact_options*/) const {
+  if (input_set->size() == 0U) {
+    return Status::InvalidArgument(
+        "Compaction must include at least one file.");
+  }
+  assert(input_files);
+
+  std::vector<CompactionInputFiles> matched_input_files;
+  matched_input_files.resize(vstorage->num_levels());
+  int first_non_empty_level = -1;
+  int last_non_empty_level = -1;
+  // TODO(yhchiang): use a lazy-initialized mapping from
+  // file_number to FileMetaData in Version.
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    for (auto file : vstorage->LevelFiles(level)) {
+      auto iter = input_set->find(file->fd.GetNumber());
+      if (iter != input_set->end()) {
+        matched_input_files[level].files.push_back(file);
+        input_set->erase(iter);
+        last_non_empty_level = level;
+        if (first_non_empty_level == -1) {
+          first_non_empty_level = level;
+        }
+      }
+    }
+  }
+
+  if (!input_set->empty()) {
+    std::string message(
+        "Cannot find matched SST files for the following file numbers:");
+    for (auto fn : *input_set) {
+      message += " ";
+      message += ToString(fn);
+    }
+    return Status::InvalidArgument(message);
+  }
+
+  for (int level = first_non_empty_level; level <= last_non_empty_level;
+       ++level) {
+    matched_input_files[level].level = level;
+    input_files->emplace_back(std::move(matched_input_files[level]));
+  }
+
+  return Status::OK();
+}
+
+// Returns true if any one of the parent files is being compacted.
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+                                           const InternalKey* smallest,
+                                           const InternalKey* largest,
+                                           int level, int* level_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level < NumberLevels());
+
+  vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+                                 level_index ? *level_index : 0, level_index);
+  return AreFilesInCompaction(inputs);
+}
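Aside (illustrative, not part of the patch): SetupOtherInputs below only adopts an expanded input set while the combined work stays under max_compaction_bytes. The byte-budget test in isolation; all sizes here are invented.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t max_compaction_bytes = 64ull << 20;      // 64 MiB budget
      const uint64_t output_level_inputs_size = 40ull << 20;  // parent files
      const uint64_t expanded_inputs_size = 20ull << 20;      // grown inputs

      // Expansion is worthwhile only if the total still fits the budget.
      const bool fits = output_level_inputs_size + expanded_inputs_size <
                        max_compaction_bytes;
      std::printf("expand inputs: %s\n", fits ? "yes" : "no");
      return 0;
    }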
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// For now we assume all levels except the start level and the output level
+// are empty.
+// Will also attempt to expand the "start level" if that doesn't expand the
+// "output level" or cause "level" to include a file for compaction that has
+// an overlapping user key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on the parent level are currently in compaction,
+// which means that we can't compact them.
+bool CompactionPicker::SetupOtherInputs(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+    CompactionInputFiles* output_level_inputs, int* parent_index,
+    int base_index) {
+  assert(!inputs->empty());
+  assert(output_level_inputs->empty());
+  const int input_level = inputs->level;
+  const int output_level = output_level_inputs->level;
+  if (input_level == output_level) {
+    // no possibility of conflict
+    return true;
+  }
+
+  // For now, we only support merging two levels, the start level and the
+  // output level. We need to assert that the other levels are empty.
+  for (int l = input_level + 1; l < output_level; l++) {
+    assert(vstorage->NumLevelFiles(l) == 0);
+  }
+
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (output_level_inputs) to include in
+  // the compaction
+  vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+                                 &output_level_inputs->files, *parent_index,
+                                 parent_index);
+  if (AreFilesInCompaction(output_level_inputs->files)) {
+    return false;
+  }
+  if (!output_level_inputs->empty()) {
+    if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+      return false;
+    }
+  }
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!output_level_inputs->empty()) {
+    const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+    const uint64_t output_level_inputs_size =
+        TotalCompensatedFileSize(output_level_inputs->files);
+    const uint64_t inputs_size = TotalCompensatedFileSize(inputs->files);
+    bool expand_inputs = false;
+
+    CompactionInputFiles expanded_inputs;
+    expanded_inputs.level = input_level;
+    // Get the closed interval of the output level
+    InternalKey all_start, all_limit;
+    GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+    bool try_overlapping_inputs = true;
+    vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+                                   &expanded_inputs.files, base_index,
+                                   nullptr);
+    uint64_t expanded_inputs_size =
+        TotalCompensatedFileSize(expanded_inputs.files);
+    if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+      try_overlapping_inputs = false;
+    }
+    if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+        output_level_inputs_size + expanded_inputs_size < limit &&
+        !AreFilesInCompaction(expanded_inputs.files)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded_inputs, &new_start, &new_limit);
+      CompactionInputFiles expanded_output_level_inputs;
+      expanded_output_level_inputs.level = output_level;
+      vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+                                     &expanded_output_level_inputs.files,
+                                     *parent_index, parent_index);
+      assert(!expanded_output_level_inputs.empty());
+      if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+          ExpandInputsToCleanCut(cf_name, vstorage,
+                                 &expanded_output_level_inputs) &&
+          expanded_output_level_inputs.size() == output_level_inputs->size()) {
+        expand_inputs = true;
+      }
+    }
+    if (!expand_inputs) {
+      vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+                                             &all_limit,
+                                             &expanded_inputs.files,
+                                             base_index, nullptr);
+      expanded_inputs_size = TotalCompensatedFileSize(expanded_inputs.files);
+      if (expanded_inputs.size() > inputs->size() &&
+          output_level_inputs_size + expanded_inputs_size < limit &&
+          !AreFilesInCompaction(expanded_inputs.files)) {
+        expand_inputs = true;
+      }
+    }
+    if (expand_inputs) {
+      ROCKS_LOG_INFO(ioptions_.info_log,
+                     "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+                     "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+                     "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+                     cf_name.c_str(), input_level, inputs->size(),
+                     output_level_inputs->size(), inputs_size,
+                     output_level_inputs_size, expanded_inputs.size(),
+                     output_level_inputs->size(), expanded_inputs_size,
+                     output_level_inputs_size);
+      inputs->files = expanded_inputs.files;
+    }
+  }
+  return true;
+}
+
+void CompactionPicker::GetGrandparents(
+    VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+    const CompactionInputFiles& output_level_inputs,
+    std::vector<FileMetaData*>* grandparents) {
+  InternalKey start, limit;
+  GetRange(inputs, output_level_inputs, &start, &limit);
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (output_level_inputs.level + 1 < NumberLevels()) {
+    vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start,
+                                   &limit, grandparents);
+  }
+}
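Aside (illustrative, not part of the patch): CompactRange() below repeatedly asks whether a candidate key range collides with a running compaction. That check reduces to the usual closed-interval overlap test, sketched here on plain strings instead of user keys.

    #include <cassert>
    #include <string>

    bool RangesOverlap(const std::string& a_min, const std::string& a_max,
                       const std::string& b_min, const std::string& b_max) {
      // Two closed ranges overlap iff each one starts no later than the
      // other ends.
      return a_min <= b_max && b_min <= a_max;
    }

    int main() {
      assert(RangesOverlap("b", "d", "c", "f"));   // partial overlap
      assert(RangesOverlap("b", "d", "d", "f"));   // shared endpoint counts
      assert(!RangesOverlap("b", "d", "e", "f"));  // disjoint
      return 0;
    }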
+
+Compaction* CompactionPicker::CompactRange(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, int input_level, int output_level,
+    const CompactRangeOptions& compact_range_options,
+    const InternalKey* begin, const InternalKey* end,
+    InternalKey** compaction_end, bool* manual_conflict,
+    uint64_t max_file_num_to_ignore) {
+  // CompactionPickerFIFO has its own implementation of compact range
+  assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+  if (input_level == ColumnFamilyData::kCompactAllLevels) {
+    assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+    // Universal compaction with more than one level always compacts all the
+    // files together to the last level.
+    assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() sets the output level to be the last level
+    if (ioptions_.allow_ingest_behind) {
+      assert(output_level == vstorage->num_levels() - 2);
+    } else {
+      assert(output_level == vstorage->num_levels() - 1);
+    }
+    // DBImpl::RunManualCompaction will use the full range for universal
+    // compaction
+    assert(begin == nullptr);
+    assert(end == nullptr);
+    *compaction_end = nullptr;
+
+    int start_level = 0;
+    for (; start_level < vstorage->num_levels() &&
+           vstorage->NumLevelFiles(start_level) == 0;
+         start_level++) {
+    }
+    if (start_level == vstorage->num_levels()) {
+      return nullptr;
+    }
+
+    if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+      *manual_conflict = true;
+      // Only one level 0 compaction allowed
+      return nullptr;
+    }
+
+    std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+                                             start_level);
+    for (int level = start_level; level < vstorage->num_levels(); level++) {
+      inputs[level - start_level].level = level;
+      auto& files = inputs[level - start_level].files;
+      for (FileMetaData* f : vstorage->LevelFiles(level)) {
+        files.push_back(f);
+      }
+      if (AreFilesInCompaction(files)) {
+        *manual_conflict = true;
+        return nullptr;
+      }
+    }
+
+    // Two non-exclusive manual compactions could run at the same time,
+    // producing overlapping outputs in the same level.
+    if (FilesRangeOverlapWithCompaction(inputs, output_level)) {
+      // This compaction output could potentially conflict with the output
+      // of a currently running compaction; we cannot run it.
+      *manual_conflict = true;
+      return nullptr;
+    }
+
+    Compaction* c = new Compaction(
+        vstorage, ioptions_, mutable_cf_options, std::move(inputs),
+        output_level,
+        MaxFileSizeForLevel(mutable_cf_options, output_level,
+                            ioptions_.compaction_style),
+        /* max_compaction_bytes */ LLONG_MAX,
+        compact_range_options.target_path_id,
+        GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+                           output_level, 1),
+        GetCompressionOptions(ioptions_, vstorage, output_level),
+        compact_range_options.max_subcompactions, /* grandparents */ {},
+        /* is manual */ true);
+    RegisterCompaction(c);
+    return c;
+  }
+
+  CompactionInputFiles inputs;
+  inputs.level = input_level;
+  bool covering_the_whole_range = true;
+
+  // All files are 'overlapping' in universal style compaction.
+  // We have to compact the entire range in one shot.
+  if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+    begin = nullptr;
+    end = nullptr;
+  }
+
+  vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+  if (inputs.empty()) {
+    return nullptr;
+  }
+
+  if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+    // Only one level 0 compaction allowed
+    TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (input_level > 0) {
+    const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+    uint64_t total = 0;
+    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+      uint64_t s = inputs[i]->compensated_file_size;
+      total += s;
+      if (total >= limit) {
+        covering_the_whole_range = false;
+        inputs.files.resize(i + 1);
+        break;
+      }
+    }
+  }
+  assert(compact_range_options.target_path_id <
+         static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+  // For BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter
+  // out files that are created during the current compaction.
+  if (compact_range_options.bottommost_level_compaction ==
+          BottommostLevelCompaction::kForceOptimized &&
+      max_file_num_to_ignore != port::kMaxUint64) {
+    assert(input_level == output_level);
+    // inputs_shrunk holds a continuous subset of input files which were all
+    // created before the current manual compaction
+    std::vector<FileMetaData*> inputs_shrunk;
+    size_t skip_input_index = inputs.size();
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+        inputs_shrunk.push_back(inputs[i]);
+      } else if (!inputs_shrunk.empty()) {
+        // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+        skip_input_index = i;
+        break;
+      }
+    }
+    if (inputs_shrunk.empty()) {
+      return nullptr;
+    }
+    if (inputs.size() != inputs_shrunk.size()) {
+      inputs.files.swap(inputs_shrunk);
+    }
+    // Set covering_the_whole_range to false if there is any file that needs
+    // to be compacted in the range of inputs[skip_input_index+1,
+    // inputs.size())
+    for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+      if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+        covering_the_whole_range = false;
+      }
+    }
+  }
+
+  InternalKey key_storage;
+  InternalKey* next_smallest = &key_storage;
+  if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+      false) {
+    // Manual compaction is now multi-threaded, so it can happen that
+    // ExpandInputsToCleanCut fails; we handle it higher up in
+    // RunManualCompaction.
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  if (covering_the_whole_range || !next_smallest) {
+    *compaction_end = nullptr;
+  } else {
+    **compaction_end = *next_smallest;
+  }
+
+  CompactionInputFiles output_level_inputs;
+  if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+    assert(input_level == 0);
+    output_level = vstorage->base_level();
+    assert(output_level > 0);
+  }
+  output_level_inputs.level = output_level;
+  if (input_level != output_level) {
+    int parent_index = -1;
+    if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+                          &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can happen that
+      // SetupOtherInputs fails; we handle it higher up in
+      // RunManualCompaction.
+      *manual_conflict = true;
+      return nullptr;
+    }
+  }
+
+  std::vector<CompactionInputFiles> compaction_inputs({inputs});
+  if (!output_level_inputs.empty()) {
+    compaction_inputs.push_back(output_level_inputs);
+  }
+  for (size_t i = 0; i < compaction_inputs.size(); i++) {
+    if (AreFilesInCompaction(compaction_inputs[i].files)) {
+      *manual_conflict = true;
+      return nullptr;
+    }
+  }
+
+  // Two non-exclusive manual compactions could run at the same time,
+  // producing overlapping outputs in the same level.
+  if (FilesRangeOverlapWithCompaction(compaction_inputs, output_level)) {
+    // This compaction output could potentially conflict with the output
+    // of a currently running compaction; we cannot run it.
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  std::vector<FileMetaData*> grandparents;
+  GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+  Compaction* compaction = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs),
+      output_level,
+      MaxFileSizeForLevel(mutable_cf_options, output_level,
+                          ioptions_.compaction_style, vstorage->base_level(),
+                          ioptions_.level_compaction_dynamic_level_bytes),
+      mutable_cf_options.max_compaction_bytes,
+      compact_range_options.target_path_id,
+      GetCompressionType(ioptions_, vstorage, mutable_cf_options,
+                         output_level, vstorage->base_level()),
+      GetCompressionOptions(ioptions_, vstorage, output_level),
+      compact_range_options.max_subcompactions, std::move(grandparents),
+      /* is manual compaction */ true);
+
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return",
+                           compaction);
+  RegisterCompaction(compaction);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are
+  // already being compacted). Since we just changed the compaction score, we
+  // recalculate it here.
+  vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+  return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+                              const SstFileMetaData& b) {
+  if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+    if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
+}
+}  // namespace
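Aside (illustrative, not part of the patch): for levels above 0, the sanitization function below widens a [first, last] window over key-sorted files while a neighbor's key range touches the window's edge. A toy version of that boundary expansion, with invented key boundaries.

    #include <cstdio>
    #include <string>
    #include <vector>

    struct FakeFile {
      std::string smallestkey;
      std::string largestkey;
    };

    int main() {
      const std::vector<FakeFile> files = {
          {"a", "c"}, {"c", "e"}, {"f", "h"}, {"h", "k"}};
      int first = 1, last = 1;  // start from the single requested file

      while (first > 0 &&
             files[first - 1].largestkey >= files[first].smallestkey) {
        --first;  // neighbor to the left overlaps the window edge
      }
      while (last + 1 < static_cast<int>(files.size()) &&
             files[last + 1].smallestkey <= files[last].largestkey) {
        ++last;  // neighbor to the right overlaps the window edge
      }
      std::printf("expanded window: [%d, %d]\n", first, last);  // [0, 1]
      return 0;
    }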
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+    std::unordered_set<uint64_t>* input_files,
+    const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+  auto& levels = cf_meta.levels;
+  auto comparator = icmp_->user_comparator();
+
+  // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+  // the smallest and largest key of the current compaction input
+  std::string smallestkey;
+  std::string largestkey;
+  // a flag for initializing the smallest and largest key
+  bool is_first = false;
+  const int kNotFound = -1;
+
+  // For each level, it does the following things:
+  // 1. Find the first and the last compaction input files
+  //    in the current level.
+  // 2. Include all files between the first and the last
+  //    compaction input files.
+  // 3. Update the compaction key range.
+  // 4. For all remaining levels, include files that have
+  //    an overlapping key range with the compaction key range.
+  for (int l = 0; l <= output_level; ++l) {
+    auto& current_files = levels[l].files;
+    int first_included = static_cast<int>(current_files.size());
+    int last_included = kNotFound;
+
+    // identify the first and the last compaction input files
+    // in the current level.
+    for (size_t f = 0; f < current_files.size(); ++f) {
+      if (input_files->find(TableFileNameToNumber(current_files[f].name)) !=
+          input_files->end()) {
+        first_included = std::min(first_included, static_cast<int>(f));
+        last_included = std::max(last_included, static_cast<int>(f));
+        if (is_first == false) {
+          smallestkey = current_files[f].smallestkey;
+          largestkey = current_files[f].largestkey;
+          is_first = true;
+        }
+      }
+    }
+    if (last_included == kNotFound) {
+      continue;
+    }
+
+    if (l != 0) {
+      // expand the compaction input of the current level if it
+      // has an overlapping key range with other non-compaction-input
+      // files in the same level.
+      while (first_included > 0) {
+        if (comparator->Compare(current_files[first_included - 1].largestkey,
+                                current_files[first_included].smallestkey) <
+            0) {
+          break;
+        }
+        first_included--;
+      }
+
+      while (last_included < static_cast<int>(current_files.size()) - 1) {
+        if (comparator->Compare(current_files[last_included + 1].smallestkey,
+                                current_files[last_included].largestkey) >
+            0) {
+          break;
+        }
+        last_included++;
+      }
+    } else if (output_level > 0) {
+      last_included = static_cast<int>(current_files.size() - 1);
+    }
+
+    // include all files between the first and the last compaction input
+    // files.
+    for (int f = first_included; f <= last_included; ++f) {
+      if (current_files[f].being_compacted) {
+        return Status::Aborted("Necessary compaction input file " +
+                               current_files[f].name +
+                               " is currently being compacted.");
+      }
+      input_files->insert(TableFileNameToNumber(current_files[f].name));
+    }
+
+    // update the smallest and largest key
+    if (l == 0) {
+      for (int f = first_included; f <= last_included; ++f) {
+        if (comparator->Compare(smallestkey, current_files[f].smallestkey) >
+            0) {
+          smallestkey = current_files[f].smallestkey;
+        }
+        if (comparator->Compare(largestkey, current_files[f].largestkey) <
+            0) {
+          largestkey = current_files[f].largestkey;
+        }
+      }
+    } else {
+      if (comparator->Compare(smallestkey,
+                              current_files[first_included].smallestkey) >
+          0) {
+        smallestkey = current_files[first_included].smallestkey;
+      }
+      if (comparator->Compare(largestkey,
+                              current_files[last_included].largestkey) < 0) {
+        largestkey = current_files[last_included].largestkey;
+      }
+    }
+
+    SstFileMetaData aggregated_file_meta;
+    aggregated_file_meta.smallestkey = smallestkey;
+    aggregated_file_meta.largestkey = largestkey;
+
+    // For all lower levels, include all overlapping files.
+    // We need to add overlapping files from the current level too, because
+    // even if there are no input_files in level l, we would still need to
+    // add files which overlap with the range containing the input_files in
+    // levels 0 to l. Level 0 doesn't need to be handled this way because
+    // files are sorted by time and not by key.
+    for (int m = std::max(l, 1); m <= output_level; ++m) {
+      for (auto& next_lv_file : levels[m].files) {
+        if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+                                     next_lv_file)) {
+          if (next_lv_file.being_compacted) {
+            return Status::Aborted(
+                "File " + next_lv_file.name +
+                ", which has an overlapping key range with one of the "
+                "compaction input files, is currently being compacted.");
+          }
+          input_files->insert(TableFileNameToNumber(next_lv_file.name));
+        }
+      }
+    }
+  }
+  if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+    return Status::Aborted(
+        "A running compaction is writing to the same output level in an "
+        "overlapping key range");
+  }
+  return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+    std::unordered_set<uint64_t>* input_files,
+    const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+  assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+         cf_meta.levels[cf_meta.levels.size() - 1].level);
+  if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+    return Status::InvalidArgument(
+        "Output level for column family " + cf_meta.name +
+        " must be between [0, " +
+        ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+  }
+
+  if (output_level > MaxOutputLevel()) {
+    return Status::InvalidArgument(
+        "Exceeds the maximum output level defined by "
+        "the current compaction algorithm --- " +
+        ToString(MaxOutputLevel()));
+  }
+
+  if (output_level < 0) {
+    return Status::InvalidArgument("Output level cannot be negative.");
+  }
+
+  if (input_files->size() == 0) {
+    return Status::InvalidArgument(
+        "A compaction must contain at least one file.");
+  }
+
+  Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+                                                      output_level);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // For all input files, check whether the file number matches
+  // any currently-existing files.
+  for (auto file_num : *input_files) {
+    bool found = false;
+    for (const auto& level_meta : cf_meta.levels) {
+      for (const auto& file_meta : level_meta.files) {
+        if (file_num == TableFileNameToNumber(file_meta.name)) {
+          if (file_meta.being_compacted) {
+            return Status::Aborted("Specified compaction input file " +
+                                   MakeTableFileName("", file_num) +
+                                   " is already being compacted.");
+          }
+          found = true;
+          break;
+        }
+      }
+      if (found) {
+        break;
+      }
+    }
+    if (!found) {
+      return Status::InvalidArgument(
+          "Specified compaction input file " +
+          MakeTableFileName("", file_num) +
+          " does not exist in column family " + cf_meta.name + ".");
+    }
+  }
+
+  return Status::OK();
+}
+#endif  // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+  if (c == nullptr) {
+    return;
+  }
+  assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+         c->output_level() == 0 ||
+         !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level()));
+  if (c->start_level() == 0 ||
+      ioptions_.compaction_style == kCompactionStyleUniversal) {
+    level0_compactions_in_progress_.insert(c);
+  }
+  compactions_in_progress_.insert(c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+  if (c == nullptr) {
+    return;
+  }
+  if (c->start_level() == 0 ||
+      ioptions_.compaction_style == kCompactionStyleUniversal) {
+    level0_compactions_in_progress_.erase(c);
+  }
+  compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+    const std::string& cf_name, VersionStorageInfo* vstorage,
+    int* start_level, int* output_level,
+    CompactionInputFiles* start_level_inputs) {
+  if (vstorage->FilesMarkedForCompaction().empty()) {
+    return;
+  }
+
+  auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted it has nothing to do here.
+    // If this assert() fails that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    *start_level = level_file.first;
+    *output_level =
+        (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+    if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+      return false;
+    }
+
+    start_level_inputs->files = {level_file.second};
+    start_level_inputs->level = *start_level;
+    return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+  };
+
+  // take a chance on a random file first
+  Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+  size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+      static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+
+  if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+    // found the compaction!
+    return;
+  }
+
+  for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+    if (continuation(level_file)) {
+      // found the compaction!
+      return;
+    }
+  }
+  start_level_inputs->files.clear();
+}
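Aside (illustrative, not part of the patch): PickFilesMarkedForCompaction() above probes one random candidate before falling back to a full scan, to avoid always favoring the front of the list. The same pattern in isolation, with a trivial stand-in predicate and made-up values.

    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
      const std::vector<int> candidates = {3, 8, 15, 4, 42};
      auto viable = [](int v) { return v % 2 == 0; };  // stand-in predicate

      // One cheap random probe first.
      std::mt19937_64 rng(/*seed=*/42);
      const size_t probe = rng() % candidates.size();
      int picked = -1;
      if (viable(candidates[probe])) {
        picked = candidates[probe];
      } else {
        for (int v : candidates) {  // deterministic fallback scan
          if (viable(v)) {
            picked = v;
            break;
          }
        }
      }
      std::printf("picked: %d\n", picked);
      return 0;
    }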
+
+bool CompactionPicker::GetOverlappingL0Files(
+    VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+    int output_level, int* parent_index) {
+  // Two level 0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+  assert(level0_compactions_in_progress()->empty());
+  InternalKey smallest, largest;
+  GetRange(*start_level_inputs, &smallest, &largest);
+  // Note that the next call will discard the file we placed in
+  // c->inputs_[0] earlier and replace it with an overlapping set
+  // which will include the picked file.
+  start_level_inputs->files.clear();
+  vstorage->GetOverlappingInputs(0, &smallest, &largest,
+                                 &(start_level_inputs->files));
+
+  // If we include more L0 files in the same compaction run it can
+  // cause the 'smallest' and 'largest' key to get extended to a
+  // larger range. So, re-invoke GetRange to get the new key range
+  GetRange(*start_level_inputs, &smallest, &largest);
+  if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+                          parent_index)) {
+    return false;
+  }
+  assert(!start_level_inputs->files.empty());
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..36d570e68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains an abstract class CompactionPicker, and its two
+// sub-classes LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits from the class and implements the
+// interface to form automatic compactions. If NeedsCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction-style-specific logic for them.
+class CompactionPicker {
+ public:
+  CompactionPicker(const ImmutableCFOptions& ioptions,
+                   const InternalKeyComparator* icmp);
+  virtual ~CompactionPicker();
+
+  // Pick level and inputs for a new compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction. Caller should delete the result.
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level. Returns nullptr if there is nothing in that
+  // level that overlaps the specified range. Caller should delete
+  // the result.
+  //
+  // The returned Compaction might not include the whole requested range.
+  // In that case, compaction_end will be set to the next key that needs
+  // compacting. In case the compaction will compact the whole range,
+  // compaction_end will be set to nullptr.
+  // The client is responsible for the compaction_end storage -- when
+  // called, *compaction_end should point to a valid InternalKey!
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore);
+
+  // The maximum allowed output level. Default value is NumberLevels() - 1.
+  virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+  Status SanitizeCompactionInputFiles(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif  // ROCKSDB_LITE
+
+  // Free up the files that participated in a compaction
+  //
+  // Requirement: DB mutex held
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Returns true if any one of the specified files is being compacted
+  bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+  // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+  // object.
+  //
+  // Caller must provide a set of input files that has been passed through
+  // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+  // between that call and this one.
+  Compaction* CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<CompactionInputFiles>& input_files, int output_level,
+      VersionStorageInfo* vstorage,
+      const MutableCFOptions& mutable_cf_options, uint32_t output_path_id);
+
+  // Converts a set of compaction input file numbers into
+  // a list of CompactionInputFiles.
+  Status GetCompactionInputsFromFileNumbers(
+      std::vector<CompactionInputFiles>* input_files,
+      std::unordered_set<uint64_t>* input_set,
+      const VersionStorageInfo* vstorage,
+      const CompactionOptions& compact_options) const;
+
+  // Is there currently a compaction involving level 0 taking place
+  bool IsLevel0CompactionInProgress() const {
+    return !level0_compactions_in_progress_.empty();
+  }
+
+  // Return true if the passed key range overlaps with a compaction output
+  // that is currently running.
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Stores the minimal range that covers all entries in inputs in
+  // *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs1 and inputs2
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs1,
+                const CompactionInputFiles& inputs2, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least one entry has one file)
+  void GetRange(const std::vector<CompactionInputFiles>& inputs,
+                InternalKey* smallest, InternalKey* largest) const;
+
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an
+  // older version in "level". Otherwise, any Get() will search "level"
+  // first, and will likely return an old/stale value for the key, since it
+  // always searches in increasing order of level to find the value. This
+  // could also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] are
+  // populated.
+  //
+  // Will return false if it is impossible to apply this compaction.
+  bool ExpandInputsToCleanCut(const std::string& cf_name,
+                              VersionStorageInfo* vstorage,
+                              CompactionInputFiles* inputs,
+                              InternalKey** next_smallest = nullptr);
+
+  // Returns true if any one of the parent files is being compacted
+  bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+                           const InternalKey* smallest,
+                           const InternalKey* largest, int level, int* index);
+
+  // Returns true if the key range that `inputs` files cover overlaps with
+  // the key range of a currently running compaction.
+  bool FilesRangeOverlapWithCompaction(
+      const std::vector<CompactionInputFiles>& inputs, int level) const;
+
+  bool SetupOtherInputs(const std::string& cf_name,
+                        const MutableCFOptions& mutable_cf_options,
+                        VersionStorageInfo* vstorage,
+                        CompactionInputFiles* inputs,
+                        CompactionInputFiles* output_level_inputs,
+                        int* parent_index, int base_index);
+
+  void GetGrandparents(VersionStorageInfo* vstorage,
+                       const CompactionInputFiles& inputs,
+                       const CompactionInputFiles& output_level_inputs,
+                       std::vector<FileMetaData*>* grandparents);
+
+  void PickFilesMarkedForCompaction(const std::string& cf_name,
+                                    VersionStorageInfo* vstorage,
+                                    int* start_level, int* output_level,
+                                    CompactionInputFiles* start_level_inputs);
+
+  bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+                             CompactionInputFiles* start_level_inputs,
+                             int output_level, int* parent_index);
+
+  // Register this compaction in the set of running compactions
+  void RegisterCompaction(Compaction* c);
+
+  // Remove this compaction from the set of running compactions
+  void UnregisterCompaction(Compaction* c);
+
+  std::set<Compaction*>* level0_compactions_in_progress() {
+    return &level0_compactions_in_progress_;
+  }
+  std::unordered_set<Compaction*>* compactions_in_progress() {
+    return &compactions_in_progress_;
+  }
+
+ protected:
+  const ImmutableCFOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+  virtual Status SanitizeCompactionInputFilesForAllLevels(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif  // ROCKSDB_LITE
+
+  // Keeps track of all compactions that are running on Level0.
+  // Protected by DB mutex
+  std::set<Compaction*> level0_compactions_in_progress_;
+
+  // Keeps track of all compactions that are running.
+  // Protected by DB mutex
+  std::unordered_set<Compaction*> compactions_in_progress_;
+
+  const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
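Aside (illustrative, not part of the patch): NullCompactionPicker below is a classic null object: it satisfies the interface but never produces work, so callers need no special cases. The idea in miniature, with invented toy types.

    #include <memory>

    class Work {};

    class Picker {
     public:
      virtual ~Picker() = default;
      virtual Work* Pick() = 0;  // may return nullptr: nothing to do
    };

    class NullPicker : public Picker {
     public:
      Work* Pick() override { return nullptr; }  // never schedules anything
    };

    int main() {
      std::unique_ptr<Picker> picker = std::make_unique<NullPicker>();
      return picker->Pick() == nullptr ? 0 : 1;
    }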
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+  NullCompactionPicker(const ImmutableCFOptions& ioptions,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual ~NullCompactionPicker() {}
+
+  // Always returns "nullptr"
+  Compaction* PickCompaction(
+      const std::string& /*cf_name*/,
+      const MutableCFOptions& /*mutable_cf_options*/,
+      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+      SequenceNumber /* earliest_memtable_seqno */) override {
+    return nullptr;
+  }
+
+  // Always returns "nullptr"
+  Compaction* CompactRange(
+      const std::string& /*cf_name*/,
+      const MutableCFOptions& /*mutable_cf_options*/,
+      VersionStorageInfo* /*vstorage*/, int /*input_level*/,
+      int /*output_level*/,
+      const CompactRangeOptions& /*compact_range_options*/,
+      const InternalKey* /*begin*/, const InternalKey* /*end*/,
+      InternalKey** /*compaction_end*/, bool* /*manual_conflict*/,
+      uint64_t /*max_file_num_to_ignore*/) override {
+    return nullptr;
+  }
+
+  // Always returns false.
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* /*vstorage*/) const override {
+    return false;
+  }
+};
+#endif  // !ROCKSDB_LITE
+
+// Attempts to find an intra-L0 compaction conforming to the given parameters.
+//
+// @param level_files                     Metadata for L0 files.
+// @param min_files_to_compact            Minimum number of files required to
+//                                        do the compaction.
+// @param max_compact_bytes_per_del_file  Maximum average size in bytes per
+//                                        file that is going to get deleted
+//                                        by the compaction.
+// @param max_compaction_bytes            Maximum total size in bytes (in
+//                                        terms of compensated file size) for
+//                                        files to be compacted.
+// @param [out] comp_inputs               If a compaction was found, will be
+//                                        initialized with corresponding
+//                                        input files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(
+    const std::vector<FileMetaData*>& level_files,
+    size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file,
+    uint64_t max_compaction_bytes, CompactionInputFiles* comp_inputs,
+    SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
+
+CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
+                                   const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level, int base_level,
+                                   const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+                                         const VersionStorageInfo* vstorage,
+                                         int level,
+                                         const bool enable_compression = true);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..b148aadc2
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..b148aadc2
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+  uint64_t total_size = 0;
+  for (const auto& f : files) {
+    total_size += f->fd.file_size;
+  }
+  return total_size;
+}
+}  // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+  assert(mutable_cf_options.ttl > 0);
+
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = GetTotalFilesSize(level_files);
+
+  int64_t _current_time;
+  auto status = ioptions_.env->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: Couldn't get current time: %s. "
+                     "Not doing compactions based on TTL. ",
+                     cf_name.c_str(), status.ToString().c_str());
+    return nullptr;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. No need "
+        "to run parallel compactions since compactions are very fast",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+
+  // avoid underflow
+  if (current_time > mutable_cf_options.ttl) {
+    for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+      FileMetaData* f = *ritr;
+      if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+        uint64_t creation_time =
+            f->fd.table_reader->GetTableProperties()->creation_time;
+        if (creation_time == 0 ||
+            creation_time >= (current_time - mutable_cf_options.ttl)) {
+          break;
+        }
+        total_size -= f->compensated_file_size;
+        inputs[0].files.push_back(f);
+      }
+    }
+  }
+
+  // Return a nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl, OR
+  // 2. there are some files older than ttl, but deleting them will not bring
+  //    the total size below the max_table_files_size threshold.
+  if (inputs[0].files.empty() ||
+      total_size >
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+    return nullptr;
+  }
+
+  for (const auto& f : inputs[0].files) {
+    uint64_t creation_time = 0;
+    if (f && f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+      creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+    }
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: picking file %" PRIu64
+                     " with creation time %" PRIu64 " for deletion",
+                     cf_name.c_str(), f->fd.GetNumber(), creation_time);
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+      kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+      {}, /* is manual */ false, vstorage->CompactionScore(0),
+      /* is deletion compaction */ true, CompactionReason::kFIFOTtl);
+  return c;
+}
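PickTTLCompaction() above walks L0 oldest-first and stops at the first file that is still within its TTL. Below is a stand-alone sketch of just that selection, with invented stand-in types; the real code reads creation_time from table properties and builds a deletion compaction from the picked files.

#include <cstddef>
#include <cstdint>
#include <vector>

struct L0File {
  uint64_t creation_time;  // 0 means unknown; such files are never expired
  uint64_t size;
};

// `oldest_first` mirrors iterating level_files in reverse, as above.
// `now` and `ttl` are in seconds.
std::vector<size_t> PickExpired(const std::vector<L0File>& oldest_first,
                                uint64_t now, uint64_t ttl) {
  std::vector<size_t> picked;
  if (now <= ttl) return picked;  // avoid underflow, as in the real code
  for (size_t i = 0; i < oldest_first.size(); ++i) {
    uint64_t ct = oldest_first[i].creation_time;
    // Files are age-ordered: the first fresh file ends the scan.
    if (ct == 0 || ct >= now - ttl) break;
    picked.push_back(i);
  }
  return picked;
}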
+
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = GetTotalFilesSize(level_files);
+
+  if (total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size ||
+      level_files.size() == 0) {
+    // total size not exceeded
+    if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+        level_files.size() > 0) {
+      CompactionInputFiles comp_inputs;
+      // try to prevent the same files from being compacted multiple times,
+      // which could produce large files that may never TTL-expire. Achieve
+      // this by disallowing compactions with files larger than the memtable
+      // (inflate its size by 10% to account for uncompressed L0 files that
+      // may have a size slightly greater than the memtable size limit).
+      size_t max_compact_bytes_per_del_file =
+          static_cast<size_t>(MultiplyCheckOverflow(
+              static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+              1.1));
+      if (FindIntraL0Compaction(
+              level_files,
+              mutable_cf_options
+                  .level0_file_num_compaction_trigger /* min_files_to_compact */,
+              max_compact_bytes_per_del_file,
+              mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+        Compaction* c = new Compaction(
+            vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0,
+            16 * 1024 * 1024 /* output file size limit */,
+            0 /* max compaction bytes, not applicable */,
+            0 /* output path ID */, mutable_cf_options.compression,
+            ioptions_.compression_opts, 0 /* max_subcompactions */, {},
+            /* is manual */ false, vstorage->CompactionScore(0),
+            /* is deletion compaction */ false,
+            CompactionReason::kFIFOReduceNumFiles);
+        return c;
+      }
+    }
+
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+        ", max size %" PRIu64 "\n",
+        cf_name.c_str(), total_size,
+        mutable_cf_options.compaction_options_fifo.max_table_files_size);
+    return nullptr;
+  }
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. No need "
+        "to run parallel compactions since compactions are very fast",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+
+  for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+    auto f = *ritr;
+    total_size -= f->compensated_file_size;
+    inputs[0].files.push_back(f);
+    char tmp_fsize[16];
+    AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: picking file %" PRIu64
+                     " with size %s for deletion",
+                     cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+    if (total_size <=
+        mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+      break;
+    }
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+      kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+      {}, /* is manual */ false, vstorage->CompactionScore(0),
+      /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize);
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+    SequenceNumber /*earliest_memtable_seqno*/) {
+  assert(vstorage->num_levels() == 1);
+
+  Compaction* c = nullptr;
+  if (mutable_cf_options.ttl > 0) {
+    c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+  }
+  if (c == nullptr) {
+    c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
+  }
+  RegisterCompaction(c);
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, int input_level, int output_level,
+    const CompactRangeOptions& /*compact_range_options*/,
+    const InternalKey* /*begin*/, const InternalKey* /*end*/,
+    InternalKey** compaction_end, bool* /*manual_conflict*/,
+    uint64_t /*max_file_num_to_ignore*/) {
+#ifdef NDEBUG
+  (void)input_level;
+  (void)output_level;
+#endif
+  assert(input_level == 0);
+  assert(output_level == 0);
+  *compaction_end = nullptr;
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log);
+  Compaction* c =
+      PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer);
+  log_buffer.FlushBufferToLog();
+  return c;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
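The size-based path in PickSizeCompaction() reduces to an eviction loop: drop the oldest files until the remaining total fits under max_table_files_size. A stand-alone model of just that loop, with invented names and plain sizes in place of FileMetaData:

#include <cstddef>
#include <cstdint>
#include <vector>

// Returns how many of the oldest files to drop. `sizes_oldest_first`
// mirrors iterating L0 in reverse, as the picker does.
size_t CountFilesToEvict(const std::vector<uint64_t>& sizes_oldest_first,
                         uint64_t max_table_files_size) {
  uint64_t total = 0;
  for (uint64_t s : sizes_oldest_first) total += s;
  size_t evicted = 0;
  for (uint64_t s : sizes_oldest_first) {
    if (total <= max_table_files_size) break;  // small enough: stop evicting
    total -= s;
    ++evicted;
  }
  return evicted;
}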
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..eb786e5ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+  FIFOCompactionPicker(const ImmutableCFOptions& ioptions,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* version, LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore) override;
+
+  // The maximum allowed output level. Always returns 0.
+  virtual int MaxOutputLevel() const override { return 0; }
+
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* vstorage) const override;
+
+ private:
+  Compaction* PickTTLCompaction(const std::string& cf_name,
+                                const MutableCFOptions& mutable_cf_options,
+                                VersionStorageInfo* version,
+                                LogBuffer* log_buffer);
+
+  Compaction* PickSizeCompaction(const std::string& cf_name,
+                                 const MutableCFOptions& mutable_cf_options,
+                                 VersionStorageInfo* version,
+                                 LogBuffer* log_buffer);
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..012edd080
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,558 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/compaction/compaction_picker_level.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  if (!vstorage->ExpiredTtlFiles().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+    if (vstorage->CompactionScore(i) >= 1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+  LevelCompactionBuilder(const std::string& cf_name,
+                         VersionStorageInfo* vstorage,
+                         SequenceNumber earliest_mem_seqno,
+                         CompactionPicker* compaction_picker,
+                         LogBuffer* log_buffer,
+                         const MutableCFOptions& mutable_cf_options,
+                         const ImmutableCFOptions& ioptions)
+      : cf_name_(cf_name),
+        vstorage_(vstorage),
+        earliest_mem_seqno_(earliest_mem_seqno),
+        compaction_picker_(compaction_picker),
+        log_buffer_(log_buffer),
+        mutable_cf_options_(mutable_cf_options),
+        ioptions_(ioptions) {}
+
+  // Pick and return a compaction.
+  Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level (or together
+  // in Intra-L0 compactions).
+  void SetupInitialFiles();
+
+  // If the initial files are from L0 level, pick other L0
+  // files if needed.
+  bool SetupOtherL0FilesIfNeeded();
+
+  // Based on the initial files, set up the other files that need to be
+  // compacted in this compaction, accordingly.
+  bool SetupOtherInputsIfNeeded();
+
+  Compaction* GetCompaction();
+
+  // For the specified level, pick a file that we want to compact.
+  // Returns false if there is no file to compact.
+  // If it returns true, inputs->files.size() will be exactly one.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return false.
+  bool PickFileToCompact();
+
+  // For L0->L0, picks the longest span of files that aren't currently
+  // undergoing compaction for which work-per-deleted-file decreases. The span
+  // always starts from the newest L0 file.
+  //
+  // Intra-L0 compaction is independent of all other files, so it can be
+  // performed even when L0->base_level compactions are blocked.
+  //
+  // Returns true if `inputs` is populated with a span of files to be
+  // compacted; otherwise, returns false.
+  bool PickIntraL0Compaction();
+
+  void PickExpiredTtlFiles();
+
+  void PickFilesMarkedForPeriodicCompaction();
+
+  const std::string& cf_name_;
+  VersionStorageInfo* vstorage_;
+  SequenceNumber earliest_mem_seqno_;
+  CompactionPicker* compaction_picker_;
+  LogBuffer* log_buffer_;
+  int start_level_ = -1;
+  int output_level_ = -1;
+  int parent_index_ = -1;
+  int base_index_ = -1;
+  double start_level_score_ = 0;
+  bool is_manual_ = false;
+  CompactionInputFiles start_level_inputs_;
+  std::vector<CompactionInputFiles> compaction_inputs_;
+  CompactionInputFiles output_level_inputs_;
+  std::vector<FileMetaData*> grandparents_;
+  CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+  const MutableCFOptions& mutable_cf_options_;
+  const ImmutableCFOptions& ioptions_;
+  // Pick a path ID to place a newly generated file, with its level
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            int level);
+
+  static const int kMinFilesForIntraL0Compaction = 4;
+};
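SetupInitialFiles(), declared above and defined below, first tries levels in descending order of their precomputed compaction score. A minimal stand-alone model of that ordering rule, with invented names (scores arrive pre-sorted, as VersionStorageInfo guarantees, and the base-level skip mirrors the starvation guard in the real loop):

#include <utility>
#include <vector>

// Each entry is {level, score}, sorted by descending score.
// Returns the start level for a size-triggered compaction, or -1.
int ChooseStartLevel(const std::vector<std::pair<int, double>>& by_score,
                     int base_level, bool l0_pick_failed) {
  for (const auto& [level, score] : by_score) {
    if (score < 1) break;  // scores are sorted; nothing else qualifies
    // If an L0->base pick already failed, skip the base level so the
    // pending L0->base work is not starved by base->base+1 compactions.
    if (l0_pick_failed && level == base_level) continue;
    return level;
  }
  return -1;
}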
+
+void LevelCompactionBuilder::PickExpiredTtlFiles() {
+  if (vstorage_->ExpiredTtlFiles().empty()) {
+    return;
+  }
+
+  auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    start_level_ = level_file.first;
+    output_level_ =
+        (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+
+    if ((start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+        (start_level_ == 0 &&
+         !compaction_picker_->level0_compactions_in_progress()->empty())) {
+      return false;
+    }
+
+    start_level_inputs_.files = {level_file.second};
+    start_level_inputs_.level = start_level_;
+    return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                      &start_level_inputs_);
+  };
+
+  for (auto& level_file : vstorage_->ExpiredTtlFiles()) {
+    if (continuation(level_file)) {
+      // found the compaction!
+      return;
+    }
+  }
+
+  start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() {
+  if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+    return;
+  }
+
+  auto continuation = [&](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    output_level_ = start_level_ = level_file.first;
+
+    if (start_level_ == 0 &&
+        !compaction_picker_->level0_compactions_in_progress()->empty()) {
+      return false;
+    }
+
+    start_level_inputs_.files = {level_file.second};
+    start_level_inputs_.level = start_level_;
+    return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                      &start_level_inputs_);
+  };
+
+  for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) {
+    if (continuation(level_file)) {
+      // found the compaction!
+      return;
+    }
+  }
+
+  start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+  // Find the compactions by size on all levels.
+  bool skipped_l0_to_base = false;
+  for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+    start_level_score_ = vstorage_->CompactionScore(i);
+    start_level_ = vstorage_->CompactionScoreLevel(i);
+    assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+    if (start_level_score_ >= 1) {
+      if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+        // If L0->base_level compaction is pending, don't schedule further
+        // compaction from base level. Otherwise L0->base_level compaction
+        // may starve.
+        continue;
+      }
+      output_level_ =
+          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+      if (PickFileToCompact()) {
+        // found the compaction!
+        if (start_level_ == 0) {
+          // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+          compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+        } else {
+          // L1+ score = `Level files size` / `MaxBytesForLevel`
+          compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+        }
+        break;
+      } else {
+        // didn't find the compaction, clear the inputs
+        start_level_inputs_.clear();
+        if (start_level_ == 0) {
+          skipped_l0_to_base = true;
+          // L0->base_level may be blocked due to ongoing L0->base_level
+          // compactions. It may also be blocked by an ongoing compaction from
+          // base_level downwards.
+          //
+          // In these cases, to reduce L0 file count and thus reduce likelihood
+          // of write stalls, we can attempt compacting a span of files within
+          // L0.
+ if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + if (start_level_inputs_.empty()) { + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, + &start_level_inputs_); + if (!start_level_inputs_.empty()) { + is_manual_ = true; + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + } + + // Bottommost Files Compaction on deleting tombstones + if (start_level_inputs_.empty()) { + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; + } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); + } else { + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + } + + // TTL Compaction + if (start_level_inputs_.empty()) { + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + } + + // Periodic Compaction + if (start_level_inputs_.empty()) { + PickFilesMarkedForPeriodicCompaction(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + if (!compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, + output_level_)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. 
+ return false; + } + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. + Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), + output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level_, vstorage_->base_level()), + GetCompressionOptions(ioptions_, vstorage_, output_level_), + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + start_level_score_, false /* deletion_compaction */, compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/master/db/column_family.cc. 
+            // Still, adding this check to avoid accidentally using
+            // max_bytes_for_level_multiplier_additional
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+          } else {
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+                mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+          }
+        }
+        cur_level++;
+        continue;
+      }
+    }
+    p++;
+    current_path_size = ioptions.cf_paths[p].target_size;
+  }
+  return p;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compaction at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (start_level_ == 0 &&
+      !compaction_picker_->level0_compactions_in_progress()->empty()) {
+    TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+    return false;
+  }
+
+  start_level_inputs_.clear();
+
+  assert(start_level_ >= 0);
+
+  // Pick the largest file in this level that is not already
+  // being compacted
+  const std::vector<int>& file_size =
+      vstorage_->FilesByCompactionPri(start_level_);
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+
+  unsigned int cmp_idx;
+  for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+       cmp_idx < file_size.size(); cmp_idx++) {
+    int index = file_size[cmp_idx];
+    auto* f = level_files[index];
+
+    // do not pick a file to compact if it is being compacted
+    // from the n-1 level.
+    if (f->being_compacted) {
+      continue;
+    }
+
+    start_level_inputs_.files.push_back(f);
+    start_level_inputs_.level = start_level_;
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &start_level_inputs_) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {start_level_inputs_}, output_level_)) {
+      // A locked (pending compaction) input-level file was pulled in due to
+      // user-key overlap.
+      start_level_inputs_.clear();
+      continue;
+    }
+
+    // Now that the input level is fully expanded, we check whether any output
+    // files are locked due to pending compaction.
+    //
+    // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+    // level files are locked, not just the extra ones pulled in for user-key
+    // overlap.
+    InternalKey smallest, largest;
+    compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                    &output_level_inputs.files);
+    if (!output_level_inputs.empty() &&
+        !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &output_level_inputs)) {
+      start_level_inputs_.clear();
+      continue;
+    }
+    base_index_ = index;
+    break;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+
+  return start_level_inputs_.size() > 0;
+}
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+  start_level_inputs_.clear();
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(0 /* level */);
+  if (level_files.size() <
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+      level_files[0]->being_compacted) {
+    // If L0 isn't accumulating many files beyond the regular trigger, don't
+    // resort to L0->L0 compaction yet.
+ return false; + } + return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, + port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, + &start_level_inputs_, earliest_mem_seqno_); +} +} // namespace + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_mem_seqno) { + LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, + log_buffer, mutable_cf_options, ioptions_); + return builder.PickCompaction(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h new file mode 100644 index 000000000..b82070e14 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_level.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +// Picking compactions for leveled compaction. See wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for description of Leveled compaction. +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc new file mode 100644 index 000000000..278bdb06a --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_test.cc @@ -0,0 +1,1741 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
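The GetPathId() walk defined in compaction_picker_level.cc above reduces to the following stand-alone sketch: a level is placed on the first cf_path whose remaining target size can still hold it, with L0 estimated at L1's size. Names are invented, and the per-level additional multiplier and dynamic level sizing are deliberately omitted:

#include <cstddef>
#include <cstdint>
#include <vector>

size_t PickPathForLevel(const std::vector<uint64_t>& path_target_sizes,
                        uint64_t base_bytes, double multiplier, int level) {
  size_t p = 0;
  uint64_t remaining = path_target_sizes.empty() ? 0 : path_target_sizes[0];
  uint64_t level_size = base_bytes;  // L0 is estimated at L1's size
  for (int cur = 0; p + 1 < path_target_sizes.size();) {
    if (level_size <= remaining) {
      if (cur == level) return p;  // the desired level fits on this path
      remaining -= level_size;     // reserve space for the smaller level
      if (cur > 0) {
        level_size = static_cast<uint64_t>(level_size * multiplier);
      }
      ++cur;
    } else {
      ++p;  // spill to the next path
      remaining = path_target_sizes[p];
    }
  }
  return p;  // the last path is the fallback
}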
+
+
+#include <limits>
+#include <string>
+#include <utility>
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+  size_t log_count;
+};
+
+class CompactionPickerTest : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  LevelCompactionPicker level_compaction_picker;
+  std::string cf_name_;
+  CountingLogger logger_;
+  LogBuffer log_buffer_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::unique_ptr<VersionStorageInfo> vstorage_;
+  std::vector<std::unique_ptr<FileMetaData>> files_;
+  // does not own FileMetaData
+  std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+  // input files to compaction process.
+  std::vector<CompactionInputFiles> input_files_;
+  int compaction_level_start_;
+
+  CompactionPickerTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        level_compaction_picker(ioptions_, &icmp_),
+        cf_name_("dummy"),
+        log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+        file_num_(1),
+        vstorage_(nullptr) {
+    mutable_cf_options_.ttl = 0;
+    mutable_cf_options_.periodic_compaction_seconds = 0;
+    // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+    // tests to cover.
+    ioptions_.compaction_pri = kByCompensatedSize;
+    fifo_options_.max_table_files_size = 1;
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    ioptions_.cf_paths.emplace_back("dummy",
+                                    std::numeric_limits<uint64_t>::max());
+  }
+
+  ~CompactionPickerTest() override {}
+
+  void NewVersionStorage(int num_levels, CompactionStyle style) {
+    DeleteVersionStorage();
+    options_.num_levels = num_levels;
+    vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+                                           style, nullptr, false));
+    vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  }
+
+  void DeleteVersionStorage() {
+    vstorage_.reset();
+    files_.clear();
+    file_map_.clear();
+    input_files_.clear();
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+           size_t compensated_file_size = 0) {
+    assert(level < vstorage_->num_levels());
+    FileMetaData* f = new FileMetaData(
+        file_number, path_id, file_size,
+        InternalKey(smallest, smallest_seq, kTypeValue),
+        InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+        largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber,
+        kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+    f->compensated_file_size =
+        (compensated_file_size != 0) ?
compensated_file_size : file_size; + vstorage_->AddFile(level, f); + files_.emplace_back(f); + file_map_.insert({file_number, {f, level}}); + } + + void SetCompactionInputFilesLevels(int level_count, int start_level) { + input_files_.resize(level_count); + for (int i = 0; i < level_count; ++i) { + input_files_[i].level = start_level + i; + } + compaction_level_start_ = start_level; + } + + void AddToCompactionFiles(uint32_t file_number) { + auto iter = file_map_.find(file_number); + assert(iter != file_map_.end()); + int level = iter->second.second; + assert(level < vstorage_->num_levels()); + input_files_[level - compaction_level_start_].files.emplace_back( + iter->second.first); + } + + void UpdateVersionStorageInfo() { + vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); + vstorage_->UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_->UpdateNumNonEmptyLevels(); + vstorage_->GenerateFileIndexer(); + vstorage_->GenerateLevelFilesBrief(); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + vstorage_->GenerateLevel0NonOverlapping(); + vstorage_->ComputeFilesMarkedForCompaction(); + vstorage_->SetFinalized(); + } +}; + +TEST_F(CompactionPickerTest, Empty) { + NewVersionStorage(6, kCompactionStyleLevel); + UpdateVersionStorageInfo(); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, Single) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "p", "q"); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, Level0Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, Level1Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, Level1Trigger2) { + mutable_cf_options_.target_file_size_base = 10000000000; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000001U); + Add(1, 88U, "201", "300", 1000000000U); + Add(2, 6U, "150", "179", 1000000000U); + Add(2, 7U, "180", "220", 1000000000U); + Add(2, 8U, "221", "300", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + 
ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize()); +} + +TEST_F(CompactionPickerTest, LevelMaxScore) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + Add(0, 1U, "150", "200", 1000000U); + // Level 1 score 1.2 + Add(1, 66U, "150", "200", 6000000U); + Add(1, 88U, "201", "300", 6000000U); + // Level 2 score 1.8. File 7 is the largest. Should be picked + Add(2, 6U, "150", "179", 60000000U); + Add(2, 7U, "180", "220", 60000001U); + Add(2, 8U, "221", "300", 60000000U); + // Level 3 score slightly larger than 1 + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(mutable_cf_options_.target_file_size_base + + mutable_cf_options_.target_file_size_base / 10, + compaction->OutputFilePreallocationSize()); +} + +TEST_F(CompactionPickerTest, NeedsCompactionLevel) { + const int kLevels = 6; + const int kFileCount = 20; + + for (int level = 0; level < kLevels - 1; ++level) { + NewVersionStorage(kLevels, kCompactionStyleLevel); + uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount; + for (int file_count = 1; file_count <= kFileCount; ++file_count) { + // start a brand new version in each test. 
+ NewVersionStorage(kLevels, kCompactionStyleLevel); + for (int i = 0; i < file_count; ++i) { + Add(level, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), + file_size, 0, i * 100, i * 100 + 99); + } + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + // release the version storage + DeleteVersionStorage(); + } + } +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 1, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic2) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 2); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 2, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 3); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(1, static_cast(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 3, 
compaction->output_level()); +} + +TEST_F(CompactionPickerTest, Level0TriggerDynamic4) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 3, 5U, "150", "180", 3U); + Add(num_levels - 3, 6U, "181", "300", 3U); + Add(num_levels - 3, 7U, "400", "450", 3U); + + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->base_level(), num_levels - 3); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(num_levels - 3, compaction->level(1)); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(2, static_cast(compaction->num_input_levels())); + ASSERT_EQ(num_levels - 3, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, LevelTriggerDynamic4) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); + Add(num_levels - 1, 3U, "200", "250", 300U); + Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 1, 4U, "400", "450", 3U); + Add(num_levels - 2, 5U, "150", "180", 300U); + Add(num_levels - 2, 6U, "181", "350", 500U); + Add(num_levels - 2, 7U, "400", "450", 200U); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(0, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(num_levels - 1, compaction->output_level()); +} + +// Universal and FIFO Compactions are not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(CompactionPickerTest, NeedsCompactionUniversal) { + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker( + ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + // verify the trigger given different number of L0 files. 
+ for (int i = 1; + i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) { + NewVersionStorage(1, kCompactionStyleUniversal); + Add(0, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100, + i * 100 + 99); + UpdateVersionStorageInfo(); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + } +} + +TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { + const uint64_t kFileSize = 100000; + NewVersionStorage(1, kCompactionStyleUniversal); + ioptions_.allow_ingest_behind = true; + ioptions_.num_levels = 3; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); + Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + // output level should be the one above the bottom-most + ASSERT_EQ(1, compaction->output_level()); +} +// Tests if the files can be trivially moved in multi level +// universal compaction when allow_trivial_move option is set +// In this test as the input files overlaps, they cannot +// be trivially moved. + +TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "100", "151", kFileSize, 0, 200, 251); + Add(1, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(2, 6U, "120", "200", kFileSize, 0, 20, 100); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_TRUE(!compaction->is_trivial_move()); +} +// Tests if the files can be trivially moved in multi level +// universal compaction when allow_trivial_move option is set +// In this test as the input files doesn't overlaps, they should +// be trivially moved. 
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(3, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(1, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(2, 3U, "301", "350", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_TRUE(compaction->is_trivial_move()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", kFileSize, 0, 260, 300); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) { + // The case where universal periodic compaction does not + // pick up only level to compact if it doesn't cover + // any file marked as periodic compaction. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[5].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_FALSE(compaction); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) { + // The case where universal periodic compaction does not + // pick up only the last sorted run which is an L0 file if it isn't + // marked as periodic compaction. 
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(0, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[5].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + + ASSERT_FALSE(compaction); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { + // The case where universal periodic compaction couldn't form + // a compaction that inlcudes any file marked for periodic compaction. + // Right now we form the compaction anyway if it is more than one + // sorted run. Just put the case here to validate that it doesn't + // crash. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(!compaction || + compaction->start_level() != compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) { + // Test single L0 file periodic compaction triggering. 
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 6U, "150", "200", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) { + // Test single sorted run non-L0 periodic compaction + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(4, 5U, "150", "200", kFileSize, 0, 500, 550); + Add(4, 6U, "350", "400", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { + NewVersionStorage(1, kCompactionStyleFIFO); + const int kFileCount = + mutable_cf_options_.level0_file_num_compaction_trigger * 3; + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * kFileCount / 2; + + fifo_options_.max_table_files_size = kMaxSize; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); + // must return false when there's no files. + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false); + + // verify whether compaction is needed based on the current + // size of L0 files. 
+  uint64_t current_size = 0;
+  for (int i = 1; i <= kFileCount; ++i) {
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    Add(0, i, ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(),
+        kFileSize, 0, i * 100, i * 100 + 99);
+    current_size += kFileSize;
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 100000000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+  Add(2, 6U, "150", "179", 50000000U);
+  Add(2, 7U, "180", "220", 50000000U);
+  Add(2, 8U, "321", "400", 50000000U);  // File not overlapping
+  Add(2, 9U, "721", "800", 50000000U);
+
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  Add(3, 30U, "750", "900", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Pick file 8 because it overlaps with 0 files on level 3.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+  // Compaction input size * 1.1
+  ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+  Add(2, 6U, "150", "175",
+      60000000U);  // Overlaps with file 26, 27, total size 521M
+  Add(2, 7U, "176", "200", 60000000U);  // Overlaps with file 27, 28, total
+                                        // size 520M, the smallest overlapping
+  Add(2, 8U, "201", "300",
+      60000000U);  // Overlaps with file 28, 29, total size 521M
+
+  Add(3, 25U, "100", "110", 261000000U);
+  Add(3, 26U, "150", "170", 261000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 261000000U);
+  Add(3, 30U, "321", "400", 261000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 7 because its overlapping size is the smallest.
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // Files 7 and 8 overlap the same amount of data in level 3, but file 8 is
+  // larger, so its overlap ratio is smaller and it will be picked.
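+  // Overlap ratios below (overlapping bytes in L3 / input file size):
+  //   file 6: 520M / 60M ~= 8.7, file 7: 260M / 60M ~= 4.3,
+  //   file 8: 260M / 61M ~= 4.26 (the smallest, hence picked).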
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27 + Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file + // itself is larger. Should be picked. + + Add(3, 26U, "160", "165", 260000000U); + Add(3, 27U, "166", "170", 260000000U); + Add(3, 28U, "180", "400", 260000000U); + Add(3, 29U, "401", "500", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.max_bytes_for_level_base = 10000000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + // file 7 and 8 over lap with the same file, but file 8 is smaller so + // it will be picked. + // Overlaps with file 26, 27. And the file is compensated so will be + // picked up. + Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U); + Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28 + + Add(3, 26U, "160", "165", 60000000U); + // Boosted file size in output level is not considered. + Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U); + Add(3, 28U, "180", "400", 60000000U); + Add(3, 29U, "401", "500", 60000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); +} + +// This test exhibits the bug where we don't properly reset parent_index in +// PickCompaction() +TEST_F(CompactionPickerTest, ParentIndexResetBug) { + int num_levels = ioptions_.num_levels; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 200; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200"); // <- marked for compaction + Add(1, 3U, "400", "500", 600); // <- this one needs compacting + Add(2, 4U, "150", "200"); + Add(2, 5U, "201", "210"); + Add(2, 6U, "300", "310"); + Add(2, 7U, "400", "500"); // <- being compacted + + vstorage_->LevelFiles(2)[3]->being_compacted = true; + vstorage_->LevelFiles(0)[0]->marked_for_compaction = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); +} + +// This test checks ExpandWhileOverlapping() by having overlapping user keys +// ranges (with different sequence numbers) in the input files. 
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kByCompensatedSize;
+
+  Add(1, 1U, "100", "150", 1U);
+  // Overlapping user keys
+  Add(1, 2U, "200", "400", 1U);
+  Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+  Add(2, 4U, "600", "700", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1000000000U);
+  Add(1, 2U, "400", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "700", 1U, 0, 0);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+  // expand multiple times)
+  Add(1, 1U, "100", "150", 1U);
+  Add(1, 2U, "150", "200", 1U, 0, 0);
+  Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+  Add(1, 4U, "250", "300", 1U, 0, 0);
+  Add(1, 5U, "300", "350", 1U, 0, 0);
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "350", "400", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+  Add(1, 1U, "100", "150", 1U);
+  Add(1, 2U, "150", "199", 1U, 0, 0);
+  Add(1, 3U, "200", "250", 1100000U, 0, 0);
+  Add(1, 4U, "251", "300", 1U, 0, 0);
+  Add(1, 5U, "300", "350", 1U, 0, 0);
+
+  Add(2, 6U, "100", "115", 1U);
+  Add(2, 7U, "125", "325", 1U);
+  Add(2, 8U, "350", "400", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1000000000U);
+  Add(1, 2U, "400", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "700", 1U, 0, 0);
+
+  vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1U, 0, 0);
+  Add(1, 2U, "401", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "300", 1U, 0, 0);
+  Add(2, 5U, "305", "450", 1U, 0, 0);
+  Add(2, 6U, "460", "600", 1U, 0, 0);
+  Add(2, 7U, "600", "700", 1U, 0, 0);
+
+  vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+  vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1U, 0, 0);
+  Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+  Add(2, 3U, "100", "250", 1U);
+  Add(2, 4U, "300", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "800", 1U, 0, 0);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_GE(1U, compaction->num_input_files(0));
+  ASSERT_GE(2U, compaction->num_input_files(1));
+  // File 5 has to be included in the compaction
+  ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up
+  // Expand input level as much as possible
+  // no overlapping case
+  Add(1, 1U, "101", "150", 1U);
+  Add(1, 2U, "151", "200", 1U);
+  Add(1, 3U, "201", "300", 1000000000U);
+  Add(1, 4U, "301", "400", 1U);
+  Add(1, 5U, "401", "500", 1U);
+  Add(2, 6U, "150", "200", 1U);
+  Add(2, 7U, "200", "450", 1U, 0, 0);
+  Add(2, 8U, "500", "600", 1U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up
+  // Expand input level as much as possible
+  // overlapping case
+  Add(1, 1U, "121", "150", 1U);
+  Add(1, 2U, "151", "200", 1U);
+  Add(1, 3U, "201", "300", 1000000000U);
+  Add(1, 4U, "301", "400", 1U);
+  Add(1, 5U, "401", "500", 1U);
+  Add(2, 6U, "100", "120", 1U);
+  Add(2, 7U, "150", "200", 1U);
+  Add(2, 8U, "200", "450", 1U, 0, 0);
+  Add(2, 9U, "501", "600", 1U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+  // Locked file encountered when pulling in extra input-level files with same
+  // user keys. Verify we pick the next-best file from the same input level.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  // file_number 2U is largest and thus first choice. But it overlaps with
+  // file_number 1U which is being compacted. So instead we pick the next-
+  // biggest file, 3U, which is eligible for compaction.
+  Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+      "150" /* largest */, 1U /* file_size */);
+  file_map_[1U].first->being_compacted = true;
+  Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+      "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+      0 /* largest_seq */);
+  Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 900000000U /* file_size */);
+  Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+      "150" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+      "200" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 1U /* file_size */);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+  // Locked file encountered when pulling in extra output-level files with same
+  // user keys. Expected to skip that compaction and pick the next-best choice.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  // score(L1) = 3.7
+  // score(L2) = 1.85
+  // There is no eligible file in L1 to compact since both candidates pull in
+  // file_number 5U, which overlaps with a file pending compaction (6U). The
+  // first eligible compaction is from L2->L3.
+  Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+      "200" /* largest */, 1000000000U /* file_size */);
+  Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 5000000000U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+      "201" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+      "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+      0 /* largest_seq */);
+  file_map_[6U].first->being_compacted = true;
+  Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 1U /* file_size */);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
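+  // (score = 6 L0 files / level0_file_num_compaction_trigger of 2 = 3)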
+ Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + file_map_[4u].first->being_compacted = true; + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // No compaction should be scheduled, if L0 has higher priority than L1 + // but L0->L1 compaction is blocked by a file in L1 being compacted. + UpdateVersionStorageInfo(); + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. + Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // If no file in L1 being compacted, L0->L1 compaction will be scheduled. + UpdateVersionStorageInfo(); // being_compacted flag is cleared here. + ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); + ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); +} + +TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_bytes_for_level_base = 900000000U; + + // 6 L0 files, score 3. + Add(0, 1U, "000", "400", 1U); + Add(0, 2U, "001", "400", 1U, 0, 0); + Add(0, 3U, "001", "400", 1000000000U, 0, 0); + Add(0, 31U, "001", "400", 1000000000U, 0, 0); + Add(0, 32U, "001", "400", 1000000000U, 0, 0); + Add(0, 33U, "001", "400", 1000000000U, 0, 0); + + // L1 score more than 6. + Add(1, 4U, "050", "300", 1000000000U, 0, 0); + file_map_[4u].first->being_compacted = true; + Add(1, 5U, "301", "350", 1000000000U, 0, 0); + Add(1, 51U, "351", "400", 6000000000U, 0, 0); + + // Output level overlaps with the beginning and the end of the chain + Add(2, 6U, "050", "100", 1U); + Add(2, 7U, "300", "400", 1U); + + // If score in L1 is larger than L0, L1 compaction goes through despite + // there is pending L0 compaction. 
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 3U, "150", "200", 200);
+  // Level 1 is over target by 200
+  Add(1, 4U, "400", "500", 600);
+  Add(1, 5U, "600", "700", 600);
+  // Level 2 is less than target 10000 even after adding the size of level 1
+  // Size ratio of L2/L1 is 9600 / 1200 = 8
+  Add(2, 6U, "150", "200", 2500);
+  Add(2, 7U, "201", "210", 2000);
+  Add(2, 8U, "300", "310", 2600);
+  Add(2, 9U, "400", "500", 2500);
+  // Level 3 exceeds target 100,000 by 1000
+  Add(3, 10U, "400", "500", 101000);
+  // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+  // Size ratio L4/L3 is 9.9
+  // After merge from L3, L4 size is 1000900
+  Add(4, 11U, "400", "500", 999900);
+  Add(5, 12U, "400", "500", 8007200);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 4U, "150", "200", 200);
+  Add(0, 5U, "150", "200", 200);
+  Add(0, 6U, "150", "200", 200);
+  // Level 1 size will be 1400 after merging with L0
+  Add(1, 7U, "400", "500", 200);
+  Add(1, 8U, "600", "700", 200);
+  // Level 2 is less than target 10000 even after adding the size of level 1
+  Add(2, 9U, "150", "200", 9100);
+  // Level 3 is over the target, but since level 4 is empty, we assume it will
+  // be a trivial move.
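+  // A trivial move rewrites no data, so level 3's 101000 bytes are expected
+  // to contribute nothing to the estimate asserted below.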
+ Add(3, 10U, "400", "500", 101000); + + UpdateVersionStorageInfo(); + + // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0) + ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + Add(0, 1U, "150", "200", 2000); + Add(0, 2U, "150", "200", 2000); + Add(0, 4U, "150", "200", 2000); + Add(0, 5U, "150", "200", 2000); + Add(0, 6U, "150", "200", 1000); + // Level 1 size will be 10000 after merging with L0 + Add(1, 7U, "400", "500", 500); + Add(1, 8U, "600", "700", 500); + + Add(2, 9U, "150", "200", 10000); + + UpdateVersionStorageInfo(); + + ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) { + int num_levels = ioptions_.num_levels; + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + NewVersionStorage(num_levels, kCompactionStyleLevel); + + // Set Last level size 50000 + // num_levels - 1 target 5000 + // num_levels - 2 is base level with target 1000 (rounded up to + // max_bytes_for_level_base). + Add(num_levels - 1, 10U, "400", "500", 50000); + + Add(0, 1U, "150", "200", 200); + Add(0, 2U, "150", "200", 200); + Add(0, 4U, "150", "200", 200); + Add(0, 5U, "150", "200", 200); + Add(0, 6U, "150", "200", 200); + // num_levels - 3 is over target by 100 + 1000 + Add(num_levels - 3, 7U, "400", "500", 550); + Add(num_levels - 3, 8U, "600", "700", 550); + // num_levels - 2 is over target by 1100 + 200 + Add(num_levels - 2, 9U, "150", "200", 5200); + + UpdateVersionStorageInfo(); + + // Merging to the second last level: (5200 / 2100 + 1) * 1100 + // Merging to the last level: (50000 / 6300 + 1) * 1300 + ASSERT_EQ(2100u + 3823u + 11617u, + vstorage_->estimated_compaction_needed_bytes()); +} + +TEST_F(CompactionPickerTest, IsBottommostLevelTest) { + // case 1: Higher levels are empty + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + bool result = + Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 2: Higher levels have no overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "k", "p"); + Add(3, 8U, "t", "w"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // case 3.1: Higher levels (level 3) have overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + 
Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "e", "g"); + Add(3, 8U, "h", "k"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.2: Higher levels (level 5) have overlap + DeleteVersionStorage(); + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "h", "k"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping + // one key ("d") + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "a", "m"); + Add(0, 2U, "c", "z"); + Add(1, 3U, "d", "e"); + Add(1, 4U, "l", "p"); + Add(2, 5U, "g", "i"); + Add(2, 6U, "x", "z"); + Add(3, 7U, "j", "k"); + Add(3, 8U, "l", "m"); + Add(4, 9U, "a", "b"); + Add(5, 10U, "c", "cc"); + Add(5, 11U, "ccc", "d"); + Add(5, 12U, "y", "yy"); + Add(5, 13U, "z", "zz"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 1); + AddToCompactionFiles(3U); + AddToCompactionFiles(5U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "z"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + // Level 0 files don't overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(1, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_TRUE(result); + + // Level 1 files overlap + NewVersionStorage(6, kCompactionStyleLevel); + Add(0, 1U, "s", "t"); + Add(0, 2U, "a", "m"); + Add(0, 3U, "b", "k"); + Add(0, 4U, "e", "f"); + Add(1, 5U, "a", "m"); + Add(1, 6U, "n", "o"); + Add(1, 7U, "w", "y"); + Add(5, 10U, "y", "z"); + UpdateVersionStorageInfo(); + SetCompactionInputFilesLevels(2, 0); + AddToCompactionFiles(1U); + AddToCompactionFiles(2U); + AddToCompactionFiles(3U); + AddToCompactionFiles(4U); + AddToCompactionFiles(5U); + AddToCompactionFiles(6U); + AddToCompactionFiles(7U); + result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_); + ASSERT_FALSE(result); + + DeleteVersionStorage(); +} + +TEST_F(CompactionPickerTest, MaxCompactionBytesHit) { + mutable_cf_options_.max_bytes_for_level_base = 1000000u; + 
+  mutable_cf_options_.max_compaction_bytes = 800000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2 and 5.
+  // It cannot expand, because adding files 1 and 3 would make the compaction
+  // size exceed mutable_cf_options_.max_compaction_bytes.
+  Add(1, 1U, "100", "150", 300000U);
+  Add(1, 2U, "151", "200", 300001U, 0, 0);
+  Add(1, 3U, "201", "250", 300000U, 0, 0);
+  Add(1, 4U, "251", "300", 300000U, 0, 0);
+  Add(2, 5U, "100", "256", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+  mutable_cf_options_.max_bytes_for_level_base = 800000u;
+  mutable_cf_options_.max_compaction_bytes = 1000000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2 and 5,
+  // and it expands to files 1 and 3 too.
+  Add(1, 1U, "100", "150", 300000U);
+  Add(1, 2U, "151", "200", 300001U, 0, 0);
+  Add(1, 3U, "201", "250", 300000U, 0, 0);
+  Add(1, 4U, "251", "300", 300000U, 0, 0);
+  Add(2, 5U, "000", "251", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+  mutable_cf_options_.max_bytes_for_level_base = 10000u;
+  mutable_cf_options_.max_compaction_bytes = 10001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2
+  Add(1, 1U, "100", "150", 3000U);
+  Add(1, 2U, "151", "200", 3001U);
+  Add(1, 3U, "201", "250", 3000U);
+  Add(1, 4U, "251", "300", 3000U);
+
+  Add(3, 5U, "120", "130", 7000U);
+  Add(3, 6U, "170", "180", 7000U);
+  Add(3, 7U, "220", "230", 7000U);
+  Add(3, 8U, "270", "280", 7000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+  mutable_cf_options_.max_compaction_bytes = 10000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick all files from level 1
+  Add(1, 1U, "100", "150", 300000U, 0, 0);
+  Add(1, 2U, "150", "200", 300000U, 0, 0);
+  Add(1, 3U, "200", "250", 300000U, 0, 0);
+  Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+  Add(3, 5U, "120", "130", 6000U);
+  Add(3, 6U, "140", "150", 6000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 1000000000U /* file_size */);
+  file_map_[1U].first->being_compacted = true;
+  Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+      "199" /* largest */, 900000000U /* file_size */);
+  Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+      "249" /* largest */, 800000000U /* file_size */);
+  Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+      "299" /* largest */, 700000000U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+      "199" /* largest */, 1U /* file_size */);
+  file_map_[5U].first->being_compacted = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(0U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+  compaction.reset(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(0U, compaction->num_input_files(1));
+  ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+  compaction.reset(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+  ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+  // Intra L0 compaction triggers only if there are at least
+  // level0_file_num_compaction_trigger + 2 L0 files.
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_compaction_bytes = 1000000u;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // All 5 L0 files will be picked for intra L0 compaction. The one L1 file
+  // spans entire L0 key range and is marked as being compacted to avoid
+  // L0->L1 compaction.
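+  // Intra-L0 compaction merges L0 files into a single larger L0 file (the
+  // output level stays 0), reducing read amplification while the L0->L1
+  // compaction path is blocked.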
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 110, 111); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 109, 110); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 6 L0 files will be picked for intra L0 compaction due to + // being_compact limit. And the latest one L0 will be skipped due to earliest + // seqno. The one L1 file spans entire L0 key range and is marked as being + // compacted to avoid L0->L1 compaction. 
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111); + Add(0, 2U, "301", "350", 1U, 0, 108, 109); + Add(0, 3U, "251", "300", 1U, 0, 106, 107); + Add(0, 4U, "201", "250", 1U, 0, 104, 105); + Add(0, 5U, "151", "200", 1U, 0, 102, 103); + Add(0, 6U, "100", "150", 1U, 0, 100, 101); + Add(0, 7U, "100", "100", 1U, 0, 99, 100); + vstorage_->LevelFiles(0)[5]->being_compacted = true; + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc new file mode 100644 index 000000000..d8b63956e --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc @@ -0,0 +1,1105 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction/compaction_picker_universal.h" +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "monitoring/statistics.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// A helper class that form universal compactions. The class is used by +// UniversalCompactionPicker::PickCompaction(). +// The usage is to create the class, and get the compaction object by calling +// PickCompaction(). +class UniversalCompactionBuilder { + public: + UniversalCompactionBuilder(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp, + const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, + LogBuffer* log_buffer) + : ioptions_(ioptions), + icmp_(icmp), + cf_name_(cf_name), + mutable_cf_options_(mutable_cf_options), + vstorage_(vstorage), + picker_(picker), + log_buffer_(log_buffer) {} + + // Form and return the compaction object. The caller owns return object. 
+  Compaction* PickCompaction();
+
+ private:
+  struct SortedRun {
+    SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+              uint64_t _compensated_file_size, bool _being_compacted)
+        : level(_level),
+          file(_file),
+          size(_size),
+          compensated_file_size(_compensated_file_size),
+          being_compacted(_being_compacted) {
+      assert(compensated_file_size > 0);
+      assert(level != 0 || file != nullptr);
+    }
+
+    void Dump(char* out_buf, size_t out_buf_size,
+              bool print_path = false) const;
+
+    // sorted_run_count is added into the string to print
+    void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+                      size_t sorted_run_count) const;
+
+    int level;
+    // `file` will be null for level > 0. For level = 0, the sorted run is
+    // for this file.
+    FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are the sums over all
+    // files in the level. `being_compacted` should be the same for all files
+    // in a non-zero level. Use the value here.
+    uint64_t size;
+    uint64_t compensated_file_size;
+    bool being_compacted;
+  };
+
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionToReduceSortedRuns(
+      unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionToReduceSizeAmp();
+
+  Compaction* PickDeleteTriggeredCompaction();
+
+  // Form a compaction from the sorted run indicated by start_index to the
+  // oldest sorted run.
+  // The caller is responsible for making sure that those files are not in
+  // compaction.
+  Compaction* PickCompactionToOldest(size_t start_index,
+                                     CompactionReason compaction_reason);
+
+  // Try to pick a periodic compaction. The caller should only call it
+  // if there is at least one file marked for periodic compaction.
+  // nullptr will be returned if no such compaction can be formed
+  // because some files are being compacted.
+  Compaction* PickPeriodicCompaction();
+
+  // Used in universal compaction when the allow_trivial_move
+  // option is set. Checks whether there are any overlapping files
+  // in the input. Returns true if the input files are non-overlapping.
+  bool IsInputFilesNonOverlapping(Compaction* c);
+
+  const ImmutableCFOptions& ioptions_;
+  const InternalKeyComparator* icmp_;
+  double score_;
+  std::vector<SortedRun> sorted_runs_;
+  const std::string& cf_name_;
+  const MutableCFOptions& mutable_cf_options_;
+  VersionStorageInfo* vstorage_;
+  UniversalCompactionPicker* picker_;
+  LogBuffer* log_buffer_;
+
+  static std::vector<SortedRun> CalculateSortedRuns(
+      const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions,
+      const MutableCFOptions& mutable_cf_options);
+
+  // Pick a path ID to place a newly generated file, with its estimated file
+  // size.
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of min heap
+// that contains the file meta data, the level of the file
+// and the index of the file in that level
+
+struct InputFileInfo {
+  InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+  FileMetaData* f;
+  size_t level;
+  size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of min heap
+// based on the smallest key of the file.
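+// Note: std::priority_queue is a max-heap by default, so the comparator below
+// returns "greater than" in order to surface the file with the smallest key
+// at the top of the heap.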
+struct SmallestKeyHeapComparator {
+  explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+  bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+    return (ucmp_->Compare(i1.f->smallest.user_key(),
+                           i2.f->smallest.user_key()) > 0);
+  }
+
+ private:
+  const Comparator* ucmp_;
+};
+
+typedef std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+                            SmallestKeyHeapComparator>
+    SmallestKeyHeap;
+
+// This function creates the heap that is used to find if the files are
+// overlapping during universal compaction when the allow_trivial_move
+// is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+  SmallestKeyHeap smallest_key_priority_q =
+      SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+  InputFileInfo input_file;
+
+  for (size_t l = 0; l < c->num_input_levels(); l++) {
+    if (c->num_input_files(l) != 0) {
+      if (l == 0 && c->start_level() == 0) {
+        for (size_t i = 0; i < c->num_input_files(0); i++) {
+          input_file.f = c->input(0, i);
+          input_file.level = 0;
+          input_file.index = i;
+          smallest_key_priority_q.push(std::move(input_file));
+        }
+      } else {
+        input_file.f = c->input(l, 0);
+        input_file.level = l;
+        input_file.index = 0;
+        smallest_key_priority_q.push(std::move(input_file));
+      }
+    }
+  }
+  return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+                             SequenceNumber* smallest_seqno,
+                             SequenceNumber* largest_seqno) {
+  bool is_first = true;
+  for (FileMetaData* f : files) {
+    assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+    if (is_first) {
+      is_first = false;
+      *smallest_seqno = f->fd.smallest_seqno;
+      *largest_seqno = f->fd.largest_seqno;
+    } else {
+      if (f->fd.smallest_seqno < *smallest_seqno) {
+        *smallest_seqno = f->fd.smallest_seqno;
+      }
+      if (f->fd.largest_seqno > *largest_seqno) {
+        *largest_seqno = f->fd.largest_seqno;
+      }
+    }
+  }
+}
+#endif
+}  // namespace
+
+// Algorithm that checks to see if there are any overlapping
+// files in the input
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+  auto comparator = icmp_->user_comparator();
+  int first_iter = 1;
+
+  InputFileInfo prev, curr, next;
+
+  SmallestKeyHeap smallest_key_priority_q =
+      create_level_heap(c, icmp_->user_comparator());
+
+  while (!smallest_key_priority_q.empty()) {
+    curr = smallest_key_priority_q.top();
+    smallest_key_priority_q.pop();
+
+    if (first_iter) {
+      prev = curr;
+      first_iter = 0;
+    } else {
+      if (comparator->Compare(prev.f->largest.user_key(),
+                              curr.f->smallest.user_key()) >= 0) {
+        // found overlapping files, return false
+        return false;
+      }
+      assert(comparator->Compare(curr.f->largest.user_key(),
+                                 prev.f->largest.user_key()) > 0);
+      prev = curr;
+    }
+
+    next.f = nullptr;
+
+    if (c->level(curr.level) != 0 &&
+        curr.index < c->num_input_files(curr.level) - 1) {
+      next.f = c->input(curr.level, curr.index + 1);
+      next.level = curr.level;
+      next.index = curr.index + 1;
+    }
+
+    if (next.f) {
+      smallest_key_priority_q.push(std::move(next));
+    }
+  }
+  return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  if (vstorage->CompactionScore(kLevel0) >= 1) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+    SequenceNumber /* earliest_memtable_seqno */) {
+  UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+                                     mutable_cf_options, vstorage, this,
+                                     log_buffer);
+  return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+                                                 size_t out_buf_size,
+                                                 bool print_path) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    if (file->fd.GetPathId() == 0 || !print_path) {
+      snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+    } else {
+      snprintf(out_buf, out_buf_size, "file %" PRIu64
+                                      "(path "
+                                      "%" PRIu32 ")",
+               file->fd.GetNumber(), file->fd.GetPathId());
+    }
+  } else {
+    snprintf(out_buf, out_buf_size, "level %d", level);
+  }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+    char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    snprintf(out_buf, out_buf_size,
+             "file %" PRIu64 "[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+             file->compensated_file_size);
+  } else {
+    snprintf(out_buf, out_buf_size,
+             "level %d[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             level, sorted_run_count, size, compensated_file_size);
+  }
+}
+
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+    const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/,
+    const MutableCFOptions& mutable_cf_options) {
+  std::vector<SortedRun> ret;
+  for (FileMetaData* f : vstorage.LevelFiles(0)) {
+    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+                     f->being_compacted);
+  }
+  for (int level = 1; level < vstorage.num_levels(); level++) {
+    uint64_t total_compensated_size = 0U;
+    uint64_t total_size = 0U;
+    bool being_compacted = false;
+    bool is_first = true;
+    for (FileMetaData* f : vstorage.LevelFiles(level)) {
+      total_compensated_size += f->compensated_file_size;
+      total_size += f->fd.GetFileSize();
+      if (mutable_cf_options.compaction_options_universal.allow_trivial_move ==
+          true) {
+        if (f->being_compacted) {
+          being_compacted = f->being_compacted;
+        }
+      } else {
+        // Compaction always includes all files for a non-zero level, so for a
+        // non-zero level, all the files should share the same being_compacted
+        // value.
+        // This assumption is only valid when
+        // mutable_cf_options.compaction_options_universal.allow_trivial_move
+        // is false
+        assert(is_first || f->being_compacted == being_compacted);
+      }
+      if (is_first) {
+        being_compacted = f->being_compacted;
+        is_first = false;
+      }
+    }
+    if (total_compensated_size > 0) {
+      ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+                       being_compacted);
+    }
+  }
+  return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
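+// A sorted run is either a single L0 file or the whole of a non-zero level.
+// Candidate compactions are tried in priority order: periodic compaction
+// first, then size amplification, then size ratio, and finally capping the
+// number of sorted runs to bound read amplification.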
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+  const int kLevel0 = 0;
+  score_ = vstorage_->CompactionScore(kLevel0);
+  sorted_runs_ =
+      CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_);
+
+  if (sorted_runs_.size() == 0 ||
+      (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+       vstorage_->FilesMarkedForCompaction().empty() &&
+       sorted_runs_.size() < (unsigned int)mutable_cf_options_
+                                 .level0_file_num_compaction_trigger)) {
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+                     cf_name_.c_str());
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  ROCKS_LOG_BUFFER_MAX_SZ(
+      log_buffer_, 3072,
+      "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n",
+      cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+  Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other types of compaction
+  // because it's a hard requirement.
+  if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+    // Always need to do a full compaction for periodic compaction.
+    c = PickPeriodicCompaction();
+  }
+
+  // Check for size amplification.
+  if (c == nullptr &&
+      sorted_runs_.size() >=
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger)) {
+    if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+                       cf_name_.c_str());
+    } else {
+      // Size amplification is within limits. Try reducing read
+      // amplification while maintaining file size ratios.
+      unsigned int ratio =
+          mutable_cf_options_.compaction_options_universal.size_ratio;
+
+      if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: compacting for size ratio\n",
+                         cf_name_.c_str());
+      } else {
+        // Size amplification and file size ratios are within configured
+        // limits. If max read amplification exceeds configured limits, then
+        // force compaction without looking at filesize ratios and try to
+        // reduce the number of files to fewer than
+        // level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction()
+        assert(sorted_runs_.size() >=
+               static_cast<size_t>(
+                   mutable_cf_options_.level0_file_num_compaction_trigger));
+        // Get the total number of sorted runs that are not being compacted
+        int num_sr_not_compacted = 0;
+        for (size_t i = 0; i < sorted_runs_.size(); i++) {
+          if (sorted_runs_[i].being_compacted == false) {
+            num_sr_not_compacted++;
+          }
+        }
+
+        // The number of sorted runs that are not being compacted is greater
+        // than the maximum allowed number of sorted runs
+        if (num_sr_not_compacted >
+            mutable_cf_options_.level0_file_num_compaction_trigger) {
+          unsigned int num_files =
+              num_sr_not_compacted -
+              mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+          if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+              nullptr) {
+            ROCKS_LOG_BUFFER(log_buffer_,
+                             "[%s] Universal: compacting for file num -- %u\n",
+                             cf_name_.c_str(), num_files);
+          }
+        }
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: delete triggered compaction\n",
+                       cf_name_.c_str());
+    }
+  }
+
+  if (c == nullptr) {
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+
+  if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+          true &&
+      c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+    c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+  }
+
+// validate that all the chosen files of L0 are non-overlapping in time
+#ifndef NDEBUG
+  SequenceNumber prev_smallest_seqno = 0U;
+  bool is_first = true;
+
+  size_t level_index = 0U;
+  if (c->start_level() == 0) {
+    for (auto f : *c->inputs(0)) {
+      assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+      if (is_first) {
+        is_first = false;
+      }
+      prev_smallest_seqno = f->fd.smallest_seqno;
+    }
+    level_index = 1U;
+  }
+  for (; level_index < c->num_input_levels(); level_index++) {
+    if (c->num_input_files(level_index) != 0) {
+      SequenceNumber smallest_seqno = 0U;
+      SequenceNumber largest_seqno = 0U;
+      GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+                              &largest_seqno);
+      if (is_first) {
+        is_first = false;
+      } else if (prev_smallest_seqno > 0) {
+        // A level is considered as the bottommost level if there are
+        // no files in higher levels or if files in higher levels do
+        // not overlap with the files being compacted. Sequence numbers
+        // of files in bottommost level can be set to 0 to help
+        // compression. As a result, the following assert may not hold
+        // if the prev_smallest_seqno is 0.
+        assert(prev_smallest_seqno > largest_seqno);
+      }
+      prev_smallest_seqno = smallest_seqno;
+    }
+  }
+#endif
+  // update statistics
+  RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION,
+                    c->inputs(0)->size());
+
+  picker_->RegisterCompaction(c);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+  TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+                           c);
+  return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+  // Two conditions need to be satisfied:
+  // (1) the target path needs to be able to hold the file's size
+  // (2) Total size left in this and previous paths need to be not
+  //     smaller than expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8), + // we will make sure the target file, probably with size of 16, will be + // placed in a path so that eventually when new files are generated and + // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or + // before the path we chose. + // + // TODO(sdong): now the case of multiple column families is not + // considered in this algorithm. So the target size can be violated in + // that case. We need to improve it. + uint64_t accumulated_size = 0; + uint64_t future_size = + file_size * + (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100; + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + for (; p < ioptions.cf_paths.size() - 1; p++) { + uint64_t target_size = ioptions.cf_paths[p].target_size; + if (target_size > file_size && + accumulated_size + (target_size - file_size) > future_size) { + return p; + } + accumulated_size += target_size; + } + return p; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact) { + unsigned int min_merge_width = + mutable_cf_options_.compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + mutable_cf_options_.compaction_options_universal.max_merge_width; + + const SortedRun* sr = nullptr; + bool done = false; + size_t start_index = 0; + unsigned int candidate_count = 0; + + unsigned int max_files_to_compact = + std::min(max_merge_width, max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Caller checks the size before executing this function. This invariant is + // important because otherwise we may have a possible integer underflow when + // dealing with unsigned types. + assert(sorted_runs_.size() > 0); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. + for (size_t loop = 0; loop < sorted_runs_.size(); loop++) { + candidate_count = 0; + + // Skip files that are already being compacted + for (sr = nullptr; loop < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; + + if (!sr->being_compacted) { + candidate_count = 1; + break; + } + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf)); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: %s" + "[%d] being compacted, skipping", + cf_name_.c_str(), file_num_buf, loop); + + sr = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0; + if (sr != nullptr) { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Possible candidate %s[%d].", + cf_name_.c_str(), file_num_buf, loop); + } + + // Check if the succeeding files need compaction. + for (size_t i = loop + 1; + candidate_count < max_files_to_compact && i < sorted_runs_.size(); + i++) { + const SortedRun* succeeding_sr = &sorted_runs_[i]; + if (succeeding_sr->being_compacted) { + break; + } + // Pick files if the total/last candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. 
+      // candidate_size is the total size of files picked so far with the
+      // default kCompactionStopStyleTotalSize; with
+      // kCompactionStopStyleSimilarSize, it's simply the size of the last
+      // picked file.
+      double sz = candidate_size * (100.0 + ratio) / 100.0;
+      if (sz < static_cast<double>(succeeding_sr->size)) {
+        break;
+      }
+      if (mutable_cf_options_.compaction_options_universal.stop_style ==
+          kCompactionStopStyleSimilarSize) {
+        // Similar-size stopping rule: also check the last picked file isn't
+        // far larger than the next candidate file.
+        sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+        if (sz < static_cast<double>(candidate_size)) {
+          // If the small file we've encountered begins a run of similar-size
+          // files, we'll pick them up on a future iteration of the outer
+          // loop. If it's some lonely straggler, it'll eventually get picked
+          // by the last-resort read amp strategy which disregards size ratios.
+          break;
+        }
+        candidate_size = succeeding_sr->compensated_file_size;
+      } else {  // default kCompactionStopStyleTotalSize
+        candidate_size += succeeding_sr->compensated_file_size;
+      }
+      candidate_count++;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      for (size_t i = loop;
+           i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+        const SortedRun* skipping_sr = &sorted_runs_[i];
+        char file_num_buf[256];
+        skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+        ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+                         cf_name_.c_str(), file_num_buf);
+      }
+    }
+  }
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  size_t first_index_after = start_index + candidate_count;
+  // Compression is enabled if files compacted earlier already reached
+  // size ratio of compression.
+  bool enable_compression = true;
+  int ratio_to_compress =
+      mutable_cf_options_.compaction_options_universal.compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = 0;
+    for (auto& sorted_run : sorted_runs_) {
+      total_size += sorted_run.compensated_file_size;
+    }
+
+    uint64_t older_file_size = 0;
+    for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+      older_file_size += sorted_runs_[i].size;
+      if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+
+  uint64_t estimated_total_size = 0;
+  for (unsigned int i = 0; i < first_index_after; i++) {
+    estimated_total_size += sorted_runs_[i].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+  int output_level;
+  if (first_index_after == sorted_runs_.size()) {
+    output_level = vstorage_->num_levels() - 1;
+  } else if (sorted_runs_[first_index_after].level == 0) {
+    output_level = 0;
+  } else {
+    output_level = sorted_runs_[first_index_after].level - 1;
+  }
+
+  // last level is reserved for the files ingested behind
+  if (ioptions_.allow_ingest_behind &&
+      (output_level == vstorage_->num_levels() - 1)) {
+    assert(output_level > 1);
+    output_level--;
+  }
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t i = start_index; i < first_index_after; i++) {
+    auto& picking_sr = sorted_runs_[i];
+    if (picking_sr.level == 0) {
+      FileMetaData* picking_file = picking_sr.file;
+      inputs[0].files.push_back(picking_file);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+                     cf_name_.c_str(), file_num_buf);
+  }
+
+  CompactionReason compaction_reason;
+  if (max_number_of_files_to_compact == UINT_MAX) {
+    compaction_reason = CompactionReason::kUniversalSizeRatio;
+  } else {
+    compaction_reason = CompactionReason::kUniversalSortedRunNum;
+  }
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+      output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      LLONG_MAX, path_id,
+      GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+                         1, enable_compression),
+      GetCompressionOptions(ioptions_, vstorage_, start_level,
+                            enable_compression),
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      score_, false /* deletion_compaction */, compaction_reason);
+}
+
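The size-ratio rule implemented by PickCompactionToReduceSortedRuns() above is worth checking with plain arithmetic. The following stand-alone sketch replays the accumulation loop for the default kCompactionStopStyleTotalSize stop style on the run sizes (1, 1, 2, 4, 8) used in the GetPathId() comment; the sizes and the 1% ratio are illustrative values, not defaults taken from the code.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      // Sorted runs from newest to oldest; the values are made up.
      std::vector<uint64_t> run_sizes = {1, 1, 2, 4, 8};
      const unsigned int ratio = 1;  // size_ratio, in percent

      // Absorb the next run while the accumulated candidate size, grown by
      // size_ratio percent, still covers it (total-size stop style).
      uint64_t candidate_size = run_sizes[0];
      std::size_t picked = 1;
      for (std::size_t i = 1; i < run_sizes.size(); i++) {
        double sz = candidate_size * (100.0 + ratio) / 100.0;
        if (sz < static_cast<double>(run_sizes[i])) {
          break;  // the next run is too large relative to what we picked
        }
        candidate_size += run_sizes[i];
        picked++;
      }
      std::printf("picked %zu runs totaling %llu\n", picked,
                  static_cast<unsigned long long>(candidate_size));
      return 0;
    }

All five runs are absorbed and the output run has size 16, which is exactly the scenario the GetPathId() comment above uses to motivate its path-size estimate.
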
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+  // percentage flexibility while reducing size amplification
+  uint64_t ratio = mutable_cf_options_.compaction_options_universal
+                       .max_size_amplification_percent;
+
+  unsigned int candidate_count = 0;
+  uint64_t candidate_size = 0;
+  size_t start_index = 0;
+  const SortedRun* sr = nullptr;
+
+  assert(!sorted_runs_.empty());
+  if (sorted_runs_.back().being_compacted) {
+    return nullptr;
+  }
+
+  // Skip files that are already being compacted
+  for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) {
+    sr = &sorted_runs_[loop];
+    if (!sr->being_compacted) {
+      start_index = loop;  // Consider this as the first candidate.
+      break;
+    }
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: skipping %s[%d] compacted %s",
+                     cf_name_.c_str(), file_num_buf, loop,
+                     " cannot be a candidate to reduce size amp.\n");
+    sr = nullptr;
+  }
+
+  if (sr == nullptr) {
+    return nullptr;  // no candidate files
+  }
+  {
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+        cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+  }
+
+  // keep adding up all the remaining files
+  for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) {
+    sr = &sorted_runs_[loop];
+    if (sr->being_compacted) {
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+      ROCKS_LOG_BUFFER(
+          log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+          cf_name_.c_str(), file_num_buf, start_index,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += sr->compensated_file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size of earliest file
+  uint64_t earliest_file_size = sorted_runs_.back().size;
+
+  // size amplification = percentage of additional size
+  if (candidate_size * 100 < ratio * earliest_file_size) {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, earliest_file_size);
+    return nullptr;
+  } else {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, earliest_file_size);
+  }
+  return PickCompactionToOldest(start_index,
+                                CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+  CompactionInputFiles start_level_inputs;
+  int output_level;
+  std::vector<CompactionInputFiles> inputs;
+
+  if (vstorage_->num_levels() == 1) {
+    // This is single level universal. Since we're basically trying to reclaim
+    // space by processing files marked for compaction due to high tombstone
+    // density, let's do the same thing as compaction to reduce size amp which
+    // has the same goals.
+ bool compact = false; + + start_level_inputs.level = 0; + start_level_inputs.files.clear(); + output_level = 0; + for (FileMetaData* f : vstorage_->LevelFiles(0)) { + if (f->marked_for_compaction) { + compact = true; + } + if (compact) { + start_level_inputs.files.push_back(f); + } + } + if (start_level_inputs.size() <= 1) { + // If only the last file in L0 is marked for compaction, ignore it + return nullptr; + } + inputs.push_back(start_level_inputs); + } else { + int start_level; + + // For multi-level universal, the strategy is to make this look more like + // leveled. We pick one of the files marked for compaction and compact with + // overlapping files in the adjacent level. + picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level, + &output_level, &start_level_inputs); + if (start_level_inputs.empty()) { + return nullptr; + } + + // Pick the first non-empty level after the start_level + for (output_level = start_level + 1; output_level < vstorage_->num_levels(); + output_level++) { + if (vstorage_->NumLevelFiles(output_level) != 0) { + break; + } + } + + // If all higher levels are empty, pick the highest level as output level + if (output_level == vstorage_->num_levels()) { + if (start_level == 0) { + output_level = vstorage_->num_levels() - 1; + } else { + // If start level is non-zero and all higher levels are empty, this + // compaction will translate into a trivial move. Since the idea is + // to reclaim space and trivial move doesn't help with that, we + // skip compaction in this case and return nullptr + return nullptr; + } + } + if (ioptions_.allow_ingest_behind && + output_level == vstorage_->num_levels() - 1) { + assert(output_level > 1); + output_level--; + } + + if (output_level != 0) { + if (start_level == 0) { + if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs, + output_level, nullptr)) { + return nullptr; + } + } + + CompactionInputFiles output_level_inputs; + int parent_index = -1; + + output_level_inputs.level = output_level; + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &start_level_inputs, &output_level_inputs, + &parent_index, -1)) { + return nullptr; + } + inputs.push_back(start_level_inputs); + if (!output_level_inputs.empty()) { + inputs.push_back(output_level_inputs); + } + if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) { + return nullptr; + } + } else { + inputs.push_back(start_level_inputs); + } + } + + uint64_t estimated_total_size = 0; + // Use size of the output level as estimated file size + for (FileMetaData* f : vstorage_->LevelFiles(output_level)) { + estimated_total_size += f->fd.GetFileSize(); + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1), + GetCompressionOptions(ioptions_, vstorage_, output_level), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true, + score_, false /* deletion_compaction */, + CompactionReason::kFilesMarkedForCompaction); +} + +Compaction* UniversalCompactionBuilder::PickCompactionToOldest( + size_t start_index, CompactionReason compaction_reason) { + assert(start_index < sorted_runs_.size()); + + // Estimate total file size + uint64_t 
estimated_total_size = 0;
+  for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+    estimated_total_size += sorted_runs_[loop].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
+    auto& picking_sr = sorted_runs_[loop];
+    if (picking_sr.level == 0) {
+      FileMetaData* f = picking_sr.file;
+      inputs[0].files.push_back(f);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    std::string comp_reason_print_string;
+    if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+      comp_reason_print_string = "periodic compaction";
+    } else if (compaction_reason ==
+               CompactionReason::kUniversalSizeAmplification) {
+      comp_reason_print_string = "size amp";
+    } else {
+      assert(false);
+    }
+
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+                     cf_name_.c_str(), comp_reason_print_string.c_str(),
+                     file_num_buf);
+  }
+
+  // output files at the bottommost level, unless it's reserved
+  int output_level = vstorage_->num_levels() - 1;
+  // last level is reserved for the files ingested behind
+  if (ioptions_.allow_ingest_behind) {
+    assert(output_level > 1);
+    output_level--;
+  }
+
+  // We never check size for
+  // compaction_options_universal.compression_size_percent,
+  // because we always compact all the files, so always compress.
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
+      output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      LLONG_MAX, path_id,
+      GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level,
+                         1, true /* enable_compression */),
+      GetCompressionOptions(ioptions_, vstorage_, start_level,
+                            true /* enable_compression */),
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      score_, false /* deletion_compaction */, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+  ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+                   cf_name_.c_str());
+
+  // In universal compaction, sorted runs that contain older data are almost
+  // always generated earlier too. To simplify the problem, we just try to
+  // trigger a full compaction. We start from the oldest sorted run and
+  // include all sorted runs, until we hit a sorted run already being
+  // compacted. Since usually the largest (which is usually the oldest)
+  // sorted run is included anyway, doing a full compaction won't increase
+  // write amplification much.
+
+  // Get some information from marked files to check whether a file is
+  // included in the compaction.
+
+  size_t start_index = sorted_runs_.size();
+  while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+    start_index--;
+  }
+  if (start_index == sorted_runs_.size()) {
+    return nullptr;
+  }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted and we end up picking files
+  // but none of them need periodic compaction.
Unless we simply recompact + // the last sorted run (either the last level or last L0 file), we would just + // execute the compaction, in order to simplify the logic. + if (start_index == sorted_runs_.size() - 1) { + bool included_file_marked = false; + int start_level = sorted_runs_[start_index].level; + FileMetaData* start_file = sorted_runs_[start_index].file; + for (const std::pair& level_file_pair : + vstorage_->FilesMarkedForPeriodicCompaction()) { + if (start_level != 0) { + // Last sorted run is a level + if (start_level == level_file_pair.first) { + included_file_marked = true; + break; + } + } else { + // Last sorted run is a L0 file. + if (start_file == level_file_pair.second) { + included_file_marked = true; + break; + } + } + } + if (!included_file_marked) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Cannot form a compaction covering file " + "marked for periodic compaction", + cf_name_.c_str()); + return nullptr; + } + } + + Compaction* c = PickCompactionToOldest(start_index, + CompactionReason::kPeriodicCompaction); + + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", c); + + return c; +} +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h new file mode 100644 index 000000000..c3f55f5d3 --- /dev/null +++ b/src/rocksdb/db/compaction/compaction_picker_universal.h @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace ROCKSDB_NAMESPACE { +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc new file mode 100644 index 000000000..49f287a97 --- /dev/null +++ b/src/rocksdb/db/comparator_db_test.cc @@ -0,0 +1,660 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
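The new test file below drives RocksDB with several custom key orderings. As background for those tests, here is a minimal sketch of a custom comparator (illustrative, not part of the patch): it must define a total order, report a stable Name() because RocksDB persists the name and refuses to open a DB whose stored comparator name differs, and may leave the key-shortening hooks as no-ops, which is always correct, merely suboptimal.

    #include <string>

    #include "rocksdb/comparator.h"
    #include "rocksdb/slice.h"

    namespace {

    // Orders keys by length first, then bytewise; the bytewise tie-break
    // makes the ordering a total order, as the Comparator contract requires.
    class LengthFirstComparator : public ROCKSDB_NAMESPACE::Comparator {
     public:
      const char* Name() const override { return "LengthFirstComparator"; }

      int Compare(const ROCKSDB_NAMESPACE::Slice& a,
                  const ROCKSDB_NAMESPACE::Slice& b) const override {
        if (a.size() != b.size()) {
          return a.size() < b.size() ? -1 : 1;
        }
        return a.compare(b);
      }

      // Leaving these empty keeps correctness; it only costs index space.
      void FindShortestSeparator(
          std::string* /*start*/,
          const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
      void FindShortSuccessor(std::string* /*key*/) const override {}
    };

    }  // namespace

Such a comparator is wired in the same way the tests below do it: assign it to options.comparator before DB::Open().
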
+#include <array>
+#include <map>
+#include <string>
+
+#include "memtable/stl_wrappers.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/kv_map.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+static const Comparator* kTestComparator = nullptr;
+
+class KVIter : public Iterator {
+ public:
+  explicit KVIter(const stl_wrappers::KVMap* map)
+      : map_(map), iter_(map_->end()) {}
+  bool Valid() const override { return iter_ != map_->end(); }
+  void SeekToFirst() override { iter_ = map_->begin(); }
+  void SeekToLast() override {
+    if (map_->empty()) {
+      iter_ = map_->end();
+    } else {
+      iter_ = map_->find(map_->rbegin()->first);
+    }
+  }
+  void Seek(const Slice& k) override {
+    iter_ = map_->lower_bound(k.ToString());
+  }
+  void SeekForPrev(const Slice& k) override {
+    iter_ = map_->upper_bound(k.ToString());
+    Prev();
+  }
+  void Next() override { ++iter_; }
+  void Prev() override {
+    if (iter_ == map_->begin()) {
+      iter_ = map_->end();
+      return;
+    }
+    --iter_;
+  }
+
+  Slice key() const override { return iter_->first; }
+  Slice value() const override { return iter_->second; }
+  Status status() const override { return Status::OK(); }
+
+ private:
+  const stl_wrappers::KVMap* const map_;
+  stl_wrappers::KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+  ASSERT_EQ(iter1->Valid(), iter2->Valid());
+  if (iter1->Valid()) {
+    ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+    ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+  }
+}
+
+// Measuring operations on DB (expected to be empty).
+// source_strings are candidate keys
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+                            Random* rnd, int num_writes, int num_iter_ops,
+                            int num_trigger_flush) {
+  stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(kTestComparator)));
+
+  for (int i = 0; i < num_writes; i++) {
+    if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+      db->Flush(FlushOptions());
+    }
+
+    int type = rnd->Uniform(2);
+    int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+    auto& key = source_strings[index];
+    switch (type) {
+      case 0:
+        // put
+        map[key] = key;
+        ASSERT_OK(db->Put(WriteOptions(), key, key));
+        break;
+      case 1:
+        // delete
+        if (map.find(key) != map.end()) {
+          map.erase(key);
+        }
+        ASSERT_OK(db->Delete(WriteOptions(), key));
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+  std::unique_ptr<Iterator> result_iter(new KVIter(&map));
+
+  bool is_valid = false;
+  for (int i = 0; i < num_iter_ops; i++) {
+    // Random walk and make sure iter and result_iter return the
+    // same key and value
+    int type = rnd->Uniform(6);
+    ASSERT_OK(iter->status());
+    switch (type) {
+      case 0:
+        // Seek to First
+        iter->SeekToFirst();
+        result_iter->SeekToFirst();
+        break;
+      case 1:
+        // Seek to last
+        iter->SeekToLast();
+        result_iter->SeekToLast();
+        break;
+      case 2: {
+        // Seek to random key
+        auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+        auto key = source_strings[key_idx];
+        iter->Seek(key);
+        result_iter->Seek(key);
+        break;
+      }
+      case 3:
+        // Next
+        if (is_valid) {
+          iter->Next();
+          result_iter->Next();
+        } else {
+          continue;
+        }
+        break;
+      case 4:
+        // Prev
+        if (is_valid) {
+          iter->Prev();
+          result_iter->Prev();
+        } else {
+          continue;
+        }
+        break;
+      default: {
+        assert(type
== 5); + auto key_idx = rnd->Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + std::string result; + auto status = db->Get(ReadOptions(), key, &result); + if (map.find(key) == map.end()) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_EQ(map[key], result); + } + break; + } + } + AssertItersEqual(iter.get(), result_iter.get()); + is_valid = iter->Valid(); + } +} + +class DoubleComparator : public Comparator { + public: + DoubleComparator() {} + + const char* Name() const override { return "DoubleComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { +#ifndef CYGWIN + double da = std::stod(a.ToString()); + double db = std::stod(b.ToString()); +#else + double da = std::strtod(a.ToString().c_str(), 0 /* endptr */); + double db = std::strtod(a.ToString().c_str(), 0 /* endptr */); +#endif + if (da == db) { + return a.compare(b); + } else if (da > db) { + return 1; + } else { + return -1; + } + } + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +class HashComparator : public Comparator { + public: + HashComparator() {} + + const char* Name() const override { return "HashComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + uint32_t ha = Hash(a.data(), a.size(), 66); + uint32_t hb = Hash(b.data(), b.size(), 66); + if (ha == hb) { + return a.compare(b); + } else if (ha > hb) { + return 1; + } else { + return -1; + } + } + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +class TwoStrComparator : public Comparator { + public: + TwoStrComparator() {} + + const char* Name() const override { return "TwoStrComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + assert(a.size() >= 2); + assert(b.size() >= 2); + size_t size_a1 = static_cast(a[0]); + size_t size_b1 = static_cast(b[0]); + size_t size_a2 = static_cast(a[1]); + size_t size_b2 = static_cast(b[1]); + assert(size_a1 + size_a2 + 2 == a.size()); + assert(size_b1 + size_b2 + 2 == b.size()); + + Slice a1 = Slice(a.data() + 2, size_a1); + Slice b1 = Slice(b.data() + 2, size_b1); + Slice a2 = Slice(a.data() + 2 + size_a1, size_a2); + Slice b2 = Slice(b.data() + 2 + size_b1, size_b2); + + if (a1 != b1) { + return a1.compare(b1); + } + return a2.compare(b2); + } + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; +} // namespace + +class ComparatorDBTest + : public testing::Test, + virtual public ::testing::WithParamInterface { + private: + std::string dbname_; + Env* env_; + DB* db_; + Options last_options_; + std::unique_ptr comparator_guard; + + public: + ComparatorDBTest() : env_(Env::Default()), db_(nullptr) { + kTestComparator = BytewiseComparator(); + dbname_ = test::PerThreadDBPath("comparator_db_test"); + BlockBasedTableOptions toptions; + toptions.format_version = GetParam(); + last_options_.table_factory.reset( + ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(toptions)); + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + ~ComparatorDBTest() override { + delete db_; + EXPECT_OK(DestroyDB(dbname_, last_options_)); + kTestComparator = BytewiseComparator(); + } + + DB* GetDB() { return db_; } + + void SetOwnedComparator(const Comparator* cmp, bool owner = true) { + if 
(owner) { + comparator_guard.reset(cmp); + } else { + comparator_guard.reset(); + } + kTestComparator = cmp; + last_options_.comparator = cmp; + } + + // Return the current option configuration. + Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } +}; + +INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(ComparatorDBTest, Bytewise) { + for (int rand_seed = 301; rand_seed < 306; rand_seed++) { + DestroyAndReopen(); + Random rnd(rand_seed); + DoRandomIteraratorTest(GetDB(), + {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd, + 8, 100, 3); + } +} + +TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) { + SetOwnedComparator(new test::SimpleSuffixReverseComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + std::vector source_prefixes; + // Randomly generate 5 prefixes + for (int i = 0; i < 5; i++) { + source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8)); + } + for (int j = 0; j < 20; j++) { + int prefix_index = rnd.Uniform(static_cast(source_prefixes.size())); + std::string key = source_prefixes[prefix_index] + + test::RandomHumanReadableString(&rnd, rnd.Uniform(8)); + source_strings.push_back(key); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66); + } +} + +TEST_P(ComparatorDBTest, Uint64Comparator) { + SetOwnedComparator(test::Uint64Comparator(), false /* owner */); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + Random64 rnd64(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint64_t r = rnd64.Next(); + std::string str; + str.resize(8); + memcpy(&str[0], static_cast(&r), 8); + source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST_P(ComparatorDBTest, DoubleComparator) { + SetOwnedComparator(new DoubleComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint32_t r = rnd.Next(); + uint32_t divide_order = rnd.Uniform(8); + double to_divide = 1.0; + for (uint32_t j = 0; j < divide_order; j++) { + to_divide *= 10.0; + } + source_strings.push_back(ToString(r / to_divide)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST_P(ComparatorDBTest, HashComparator) { + SetOwnedComparator(new HashComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys 
+ for (int i = 0; i < 100; i++) { + source_strings.push_back(test::RandomKey(&rnd, 8)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST_P(ComparatorDBTest, TwoStrComparator) { + SetOwnedComparator(new TwoStrComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = kTestComparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + std::string str; + uint32_t size1 = rnd.Uniform(8); + uint32_t size2 = rnd.Uniform(8); + str.append(1, static_cast(size1)); + str.append(1, static_cast(size2)); + str.append(test::RandomKey(&rnd, size1)); + str.append(test::RandomKey(&rnd, size2)); + source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { + { + // different length + Slice s("abcxy"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxyz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // not last byte different + Slice s("abc1xyz"); + Slice t("abc2xyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // same string + Slice s("abcxyz"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxy"); + Slice t("abcxz"); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xac"; + const char t_array[] = "\x50\x8a\xad"; + Slice s(s_array); + Slice t(t_array); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff"; + const char t_array[] = "\x50\x8b\x00"; + Slice s(s_array, 3); + Slice t(t_array, 3); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x00"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x01"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } +} + +TEST_P(ComparatorDBTest, FindShortestSeparator) { + std::string s1 = "abc1xyz"; + std::string s2 = "abc3xy"; + + BytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc2", s1); + + s1 = "abc5xyztt"; + + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc5", s1); + + s1 = "abc3"; + s2 = "abc2xy"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + s1 = "abc3xyz"; + s2 = "abc2xy"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + s1 = "abc3xyz"; + s2 = "abc2"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + std::string old_s1 = s1 = "abc2xy"; + s2 = "abc2"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_TRUE(old_s1 >= s1); + ASSERT_TRUE(s1 > s2); +} + +TEST_P(ComparatorDBTest, 
SeparatorSuccessorRandomizeTest) { + // Char list for boundary cases. + std::array char_list{{0, 1, 2, 253, 254, 255}}; + Random rnd(301); + + for (int attempts = 0; attempts < 1000; attempts++) { + uint32_t size1 = rnd.Skewed(4); + uint32_t size2; + + if (rnd.OneIn(2)) { + // size2 to be random size + size2 = rnd.Skewed(4); + } else { + // size1 is within [-2, +2] of size1 + int diff = static_cast(rnd.Uniform(5)) - 2; + int tmp_size2 = static_cast(size1) + diff; + if (tmp_size2 < 0) { + tmp_size2 = 0; + } + size2 = static_cast(tmp_size2); + } + + std::string s1; + std::string s2; + for (uint32_t i = 0; i < size1; i++) { + if (rnd.OneIn(2)) { + // Use random byte + s1 += static_cast(rnd.Uniform(256)); + } else { + // Use one byte in char_list + char c = static_cast(char_list[rnd.Uniform(sizeof(char_list))]); + s1 += c; + } + } + + // First set s2 to be the same as s1, and then modify s2. + s2 = s1; + s2.resize(size2); + // We start from the back of the string + if (size2 > 0) { + uint32_t pos = size2 - 1; + do { + if (pos >= size1 || rnd.OneIn(4)) { + // For 1/4 chance, use random byte + s2[pos] = static_cast(rnd.Uniform(256)); + } else if (rnd.OneIn(4)) { + // In 1/4 chance, stop here. + break; + } else { + // Create a char within [-2, +2] of the matching char of s1. + int diff = static_cast(rnd.Uniform(5)) - 2; + // char may be signed or unsigned based on platform. + int s1_char = static_cast(static_cast(s1[pos])); + int s2_char = s1_char + diff; + if (s2_char < 0) { + s2_char = 0; + } + if (s2_char > 255) { + s2_char = 255; + } + s2[pos] = static_cast(s2_char); + } + } while (pos-- != 0); + } + + // Test separators + for (int rev = 0; rev < 2; rev++) { + if (rev == 1) { + // switch s1 and s2 + std::string t = s1; + s1 = s2; + s2 = t; + } + std::string separator = s1; + BytewiseComparator()->FindShortestSeparator(&separator, s2); + std::string rev_separator = s1; + ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2); + + if (s1 == s2) { + ASSERT_EQ(s1, separator); + ASSERT_EQ(s2, rev_separator); + } else if (s1 < s2) { + ASSERT_TRUE(s1 <= separator); + ASSERT_TRUE(s2 > separator); + ASSERT_LE(separator.size(), std::max(s1.size(), s2.size())); + ASSERT_EQ(s1, rev_separator); + } else { + ASSERT_TRUE(s1 >= rev_separator); + ASSERT_TRUE(s2 < rev_separator); + ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size())); + ASSERT_EQ(s1, separator); + } + } + + // Test successors + std::string succ = s1; + BytewiseComparator()->FindShortSuccessor(&succ); + ASSERT_TRUE(succ >= s1); + + succ = s1; + ReverseBytewiseComparator()->FindShortSuccessor(&succ); + ASSERT_TRUE(succ <= s1); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc new file mode 100644 index 000000000..206f1f875 --- /dev/null +++ b/src/rocksdb/db/convenience.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+  (static_cast_with_check<DBImpl, DB>(db->GetRootDB()))
+      ->CancelAllBackgroundWork(wait);
+}
+
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+                          const Slice* begin, const Slice* end,
+                          bool include_end) {
+  RangePtr range(begin, end);
+  return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangePtr* ranges, size_t n,
+                           bool include_end) {
+  return (static_cast_with_check<DBImpl, DB>(db->GetRootDB()))
+      ->DeleteFilesInRanges(column_family, ranges, n, include_end);
+}
+
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const std::string& file_path) {
+  return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
+}
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const ReadOptions& read_options,
+                             const std::string& file_path) {
+  std::unique_ptr<FSRandomAccessFile> file;
+  uint64_t file_size;
+  InternalKeyComparator internal_comparator(options.comparator);
+  ImmutableCFOptions ioptions(options);
+
+  Status s = ioptions.fs->NewRandomAccessFile(file_path,
+                                              FileOptions(env_options),
+                                              &file, nullptr);
+  if (s.ok()) {
+    s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+  } else {
+    return s;
+  }
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(file), file_path));
+  const bool kImmortal = true;
+  s = ioptions.table_factory->NewTableReader(
+      TableReaderOptions(ioptions, options.prefix_extractor.get(), env_options,
+                         internal_comparator, false /* skip_filters */,
+                         !kImmortal, -1 /* level */),
+      std::move(file_reader), file_size, &table_reader,
+      false /* prefetch_index_and_filter_in_cache */);
+  if (!s.ok()) {
+    return s;
+  }
+  s = table_reader->VerifyChecksum(read_options,
+                                   TableReaderCaller::kUserVerifyChecksum);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
new file mode 100644
index 000000000..203c34fa4
--- /dev/null
+++ b/src/rocksdb/db/corruption_test.cc
@@ -0,0 +1,613 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
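The corruption tests that follow lean on the convenience API added above: after flipping bytes in a table file they assert that VerifySstFileChecksum returns a non-OK status. A minimal usage sketch of that entry point (the SST path is a hypothetical placeholder, and error handling is reduced to a printout) looks like this; the two-argument overload simply forwards to the ReadOptions-taking overload defined above.

    #include <cstdio>
    #include <string>

    #include "rocksdb/convenience.h"
    #include "rocksdb/env.h"
    #include "rocksdb/options.h"
    #include "rocksdb/status.h"

    int main() {
      ROCKSDB_NAMESPACE::Options options;
      ROCKSDB_NAMESPACE::EnvOptions env_options;
      const std::string sst_path = "/tmp/example.sst";  // hypothetical path

      // Reads every block of the file and checks its stored checksum.
      ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(
          options, env_options, sst_path);
      std::printf("verify %s: %s\n", sst_path.c_str(), s.ToString().c_str());
      return s.ok() ? 0 : 1;
    }
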
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/db.h" + +#include +#include +#include +#include +#include +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "env/composite_env_wrapper.h" +#include "file/filename.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/meta_blocks.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +static const int kValueSize = 1000; + +class CorruptionTest : public testing::Test { + public: + test::ErrorEnv env_; + std::string dbname_; + std::shared_ptr tiny_cache_; + Options options_; + DB* db_; + + CorruptionTest() { + // If LRU cache shard bit is smaller than 2 (or -1 which will automatically + // set it to 0), test SequenceNumberRecovery will fail, likely because of a + // bug in recovery code. Keep it 4 for now to make the test passes. + tiny_cache_ = NewLRUCache(100, 4); + options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; + options_.env = &env_; + dbname_ = test::PerThreadDBPath("corruption_test"); + DestroyDB(dbname_, options_); + + db_ = nullptr; + options_.create_if_missing = true; + BlockBasedTableOptions table_options; + table_options.block_size_deviation = 0; // make unit test pass for now + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(); + options_.create_if_missing = false; + } + + ~CorruptionTest() override { + delete db_; + DestroyDB(dbname_, Options()); + } + + void CloseDb() { + delete db_; + db_ = nullptr; + } + + Status TryReopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opt = (options ? *options : options_); + if (opt.env == Options().env) { + // If env is not overridden, replace it with ErrorEnv. + // Otherwise, the test already uses a non-default Env. + opt.env = &env_; + } + opt.arena_block_size = 4096; + BlockBasedTableOptions table_options; + table_options.block_cache = tiny_cache_; + table_options.block_size_deviation = 0; + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + return DB::Open(opt, dbname_, &db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void RepairDB() { + delete db_; + db_ = nullptr; + ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_)); + } + + void Build(int n, int flush_every = 0) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = 0; i < n; i++) { + if (flush_every != 0 && i != 0 && i % flush_every == 0) { + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + } + //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + } + } + + void Check(int min_expected, int max_expected) { + uint64_t next_expected = 0; + uint64_t missed = 0; + int bad_keys = 0; + int bad_values = 0; + int correct = 0; + std::string value_space; + // Do not verify checksums. If we verify checksums then the + // db itself will raise errors because data is corrupted. + // Instead, we want the reads to be successful and this test + // will detect whether the appropriate corruptions have + // occurred. 
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + uint64_t key; + Slice in(iter->key()); + if (!ConsumeDecimalNumber(&in, &key) || + !in.empty() || + key < next_expected) { + bad_keys++; + continue; + } + missed += (key - next_expected); + next_expected = key + 1; + if (iter->value() != Value(static_cast(key), &value_space)) { + bad_values++; + } else { + correct++; + } + } + delete iter; + + fprintf(stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n", + min_expected, max_expected, correct, bad_keys, bad_values, + static_cast(missed)); + ASSERT_LE(min_expected, correct); + ASSERT_GE(max_expected, correct); + } + + void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + const char* msg = strerror(errno); + FAIL() << fname << ": " << msg; + } + + if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > sbuf.st_size) { + offset = 0; + } else { + offset = static_cast(sbuf.st_size + offset); + } + } + if (offset > sbuf.st_size) { + offset = static_cast(sbuf.st_size); + } + if (offset + bytes_to_corrupt > sbuf.st_size) { + bytes_to_corrupt = static_cast(sbuf.st_size - offset); + } + + // Do it + std::string contents; + Status s = ReadFileToString(Env::Default(), fname, &contents); + ASSERT_TRUE(s.ok()) << s.ToString(); + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(Env::Default(), contents, fname); + ASSERT_TRUE(s.ok()) << s.ToString(); + Options options; + EnvOptions env_options; + options.file_system.reset(new LegacyFileSystemWrapper(options.env)); + ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname)); + } + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + int picked_number = -1; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && + type == filetype && + static_cast(number) > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = static_cast(number); + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + + CorruptFile(fname, offset, bytes_to_corrupt); + } + + // corrupts exactly one file at level `level`. if no file found at level, + // asserts + void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + for (const auto& m : metadata) { + if (m.level == level) { + CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt); + return; + } + } + FAIL() << "no file found at level"; + } + + + int Property(const std::string& name) { + std::string property; + int result; + if (db_->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + // Return the ith key + Slice Key(int i, std::string* storage) { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) { + if (k == 0) { + // Ugh. Random seed of 0 used to produce no entropy. 
This code + // preserves the implementation that was in place when all of the + // magic values in this file were picked. + *storage = std::string(kValueSize, ' '); + return Slice(*storage); + } else { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } + } +}; + +TEST_F(CorruptionTest, Recovery) { + Build(100); + Check(100, 100); +#ifdef OS_WIN + // On Wndows OS Disk cache does not behave properly + // We do not call FlushBuffers on every Flush. If we do not close + // the log file prior to the corruption we end up with the first + // block not corrupted but only the second. However, under the debugger + // things work just fine but never pass when running normally + // For that reason people may want to run with unbuffered I/O. That option + // is not available for WAL though. + CloseDb(); +#endif + Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record + Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + ASSERT_TRUE(!TryReopen().ok()); + options_.paranoid_checks = false; + Reopen(&options_); + + // The 64 records in the first two log blocks are completely lost. + Check(36, 36); +} + +TEST_F(CorruptionTest, RecoverWriteError) { + env_.writable_file_error_ = true; + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); +} + +TEST_F(CorruptionTest, NewFileErrorDuringWrite) { + // Do enough writing to force minor compaction + env_.writable_file_error_ = true; + const int num = + static_cast(3 + (Options().write_buffer_size / kValueSize)); + std::string value_storage; + Status s; + bool failed = false; + for (int i = 0; i < num; i++) { + WriteBatch batch; + batch.Put("a", Value(100, &value_storage)); + s = db_->Write(WriteOptions(), &batch); + if (!s.ok()) { + failed = true; + } + ASSERT_TRUE(!failed || !s.ok()); + } + ASSERT_TRUE(!s.ok()); + ASSERT_GE(env_.num_writable_file_errors_, 1); + env_.writable_file_error_ = false; + Reopen(); +} + +TEST_F(CorruptionTest, TableFile) { + Build(100); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + Corrupt(kTableFile, 100, 1); + Check(99, 99); + ASSERT_NOK(dbi->VerifyChecksum()); +} + +TEST_F(CorruptionTest, VerifyChecksumReadahead) { + Options options; + SpecialEnv senv(Env::Default()); + options.env = &senv; + // Disable block cache as we are going to check checksum for + // the same file twice and measure number of reads. + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc)); + + Reopen(&options); + + Build(10000); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + senv.count_random_reads_ = true; + senv.random_read_counter_.Reset(); + ASSERT_OK(dbi->VerifyChecksum()); + + // Make sure the counter is enabled. + ASSERT_GT(senv.random_read_counter_.Read(), 0); + + // The SST file is about 10MB. Default readahead size is 256KB. + // Give a conservative 20 reads for metadata blocks, The number + // of random reads should be within 10 MB / 256KB + 20 = 60. + ASSERT_LT(senv.random_read_counter_.Read(), 60); + + senv.random_read_bytes_counter_ = 0; + ReadOptions ro; + ro.readahead_size = size_t{32 * 1024}; + ASSERT_OK(dbi->VerifyChecksum(ro)); + // The SST file is about 10MB. We set readahead size to 32KB. 
+ // Give 0 to 20 reads for metadata blocks, and allow real read + // to range from 24KB to 48KB. The lower bound would be: + // 10MB / 48KB + 0 = 213 + // The higher bound is + // 10MB / 24KB + 20 = 447. + ASSERT_GE(senv.random_read_counter_.Read(), 213); + ASSERT_LE(senv.random_read_counter_.Read(), 447); + + // Test readahead shouldn't break mmap mode (where it should be + // disabled). + options.allow_mmap_reads = true; + Reopen(&options); + dbi = static_cast(db_); + ASSERT_OK(dbi->VerifyChecksum(ro)); + + CloseDb(); +} + +TEST_F(CorruptionTest, TableFileIndexData) { + Options options; + // very big, we'll trigger flushes manually + options.write_buffer_size = 100 * 1024 * 1024; + Reopen(&options); + // build 2 tables, flush at 5000 + Build(10000, 5000); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + + // corrupt an index block of an entire file + Corrupt(kTableFile, -2000, 500); + options.paranoid_checks = false; + Reopen(&options); + dbi = reinterpret_cast(db_); + // one full file may be readable, since only one was corrupted + // the other file should be fully non-readable, since index was corrupted + Check(0, 5000); + ASSERT_NOK(dbi->VerifyChecksum()); + + // In paranoid mode, the db cannot be opened due to the corrupted file. + ASSERT_TRUE(TryReopen().IsCorruption()); +} + +TEST_F(CorruptionTest, MissingDescriptor) { + Build(1000); + RepairDB(); + Reopen(); + Check(1000, 1000); +} + +TEST_F(CorruptionTest, SequenceNumberRecovery) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v5", v); + // Write something. If sequence number was not recovered properly, + // it will be hidden by an earlier write. 
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); + Reopen(); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("v6", v); +} + +TEST_F(CorruptionTest, CorruptedDescriptor) { + ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + + Corrupt(kDescriptorFile, 0, 1000); + Status s = TryReopen(); + ASSERT_TRUE(!s.ok()); + + RepairDB(); + Reopen(); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("hello", v); +} + +TEST_F(CorruptionTest, CompactionInputError) { + Options options; + Reopen(&options); + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level2")); + + Corrupt(kTableFile, 100, 1); + Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); + + // Force compactions by writing lots of values + Build(10000); + Check(10000, 10000); + ASSERT_NOK(dbi->VerifyChecksum()); +} + +TEST_F(CorruptionTest, CompactionInputErrorParanoid) { + Options options; + options.paranoid_checks = true; + options.write_buffer_size = 131072; + options.max_write_buffer_number = 2; + Reopen(&options); + DBImpl* dbi = reinterpret_cast(db_); + + // Fill levels >= 1 + for (int level = 1; level < dbi->NumberLevels(); level++) { + dbi->Put(WriteOptions(), "", "begin"); + dbi->Put(WriteOptions(), "~", "end"); + dbi->TEST_FlushMemTable(); + for (int comp_level = 0; comp_level < dbi->NumberLevels() - level; + ++comp_level) { + dbi->TEST_CompactRange(comp_level, nullptr, nullptr); + } + } + + Reopen(&options); + + dbi = reinterpret_cast(db_); + Build(10); + dbi->TEST_FlushMemTable(); + dbi->TEST_WaitForCompact(); + ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); + + CorruptTableFileAtLevel(0, 100, 1); + Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); + + // Write must eventually fail because of corrupted table + Status s; + std::string tmp1, tmp2; + bool failed = false; + for (int i = 0; i < 10000; i++) { + s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); + if (!s.ok()) { + failed = true; + } + // if one write failed, every subsequent write must fail, too + ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db"; + } + ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; +} + +TEST_F(CorruptionTest, UnrelatedKeys) { + Build(10); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + Corrupt(kTableFile, 100, 1); + ASSERT_NOK(dbi->VerifyChecksum()); + + std::string tmp1, tmp2; + ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); + std::string v; + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); + dbi->TEST_FlushMemTable(); + ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); + ASSERT_EQ(Value(1000, &tmp2).ToString(), v); +} + +TEST_F(CorruptionTest, RangeDeletionCorrupted) { + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b")); + ASSERT_OK(db_->Flush(FlushOptions())); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(static_cast(1), metadata.size()); + std::string filename = dbname_ + metadata[0].name; + + std::unique_ptr file; + ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions())); + std::unique_ptr 
file_reader( + new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), + filename)); + + uint64_t file_size; + ASSERT_OK(options_.env->GetFileSize(filename, &file_size)); + + BlockHandle range_del_handle; + ASSERT_OK(FindMetaBlock( + file_reader.get(), file_size, kBlockBasedTableMagicNumber, + ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle)); + + ASSERT_OK(TryReopen()); + CorruptFile(filename, static_cast(range_del_handle.offset()), 1); + ASSERT_TRUE(TryReopen().IsCorruption()); +} + +TEST_F(CorruptionTest, FileSystemStateCorrupted) { + for (int iter = 0; iter < 2; ++iter) { + Options options; + options.paranoid_checks = true; + options.create_if_missing = true; + Reopen(&options); + Build(10); + ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = reinterpret_cast(db_); + std::vector metadata; + dbi->GetLiveFilesMetaData(&metadata); + ASSERT_GT(metadata.size(), size_t(0)); + std::string filename = dbname_ + metadata[0].name; + + delete db_; + db_ = nullptr; + + if (iter == 0) { // corrupt file size + std::unique_ptr file; + env_.NewWritableFile(filename, &file, EnvOptions()); + file->Append(Slice("corrupted sst")); + file.reset(); + Status x = TryReopen(&options); + ASSERT_TRUE(x.IsCorruption()); + } else { // delete the file + env_.DeleteFile(filename); + Status x = TryReopen(&options); + ASSERT_TRUE(x.IsPathNotFound()); + } + + DestroyDB(dbname_, options_); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc new file mode 100644 index 000000000..9467840ff --- /dev/null +++ b/src/rocksdb/db/cuckoo_table_db_test.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "db/db_impl/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" +#include "table/meta_blocks.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class CuckooTableDBTest : public testing::Test { + private: + std::string dbname_; + Env* env_; + DB* db_; + + public: + CuckooTableDBTest() : env_(Env::Default()) { + dbname_ = test::PerThreadDBPath("cuckoo_table_db_test"); + EXPECT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~CuckooTableDBTest() override { + delete db_; + EXPECT_OK(DestroyDB(dbname_, Options())); + } + + Options CurrentOptions() { + Options options; + options.table_factory.reset(NewCuckooTableFactory()); + options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true)); + options.allow_mmap_reads = true; + options.create_if_missing = true; + options.allow_concurrent_memtable_write = false; + return options; + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + // The following util methods are copied from plain_table_db_test. 
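+  // For reference, a hedged sketch of how the cuckoo factory used in
+  // CurrentOptions() above can be tuned; CuckooTableOptions and these field
+  // names come from rocksdb/table.h, and the values shown are illustrative
+  // assumptions rather than recommendations:
+  //   CuckooTableOptions cuckoo_options;
+  //   cuckoo_options.hash_table_ratio = 0.9;  // target load factor
+  //   cuckoo_options.max_search_depth = 100;  // bound on displacement path
+  //   options.table_factory.reset(NewCuckooTableFactory(cuckoo_options));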
+ void Reopen(Options* options = nullptr) { + delete db_; + db_ = nullptr; + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts = CurrentOptions(); + opts.create_if_missing = true; + } + ASSERT_OK(DB::Open(opts, dbname_, &db_)); + } + + Status Put(const Slice& k, const Slice& v) { + return db_->Put(WriteOptions(), k, v); + } + + Status Delete(const std::string& k) { + return db_->Delete(WriteOptions(), k); + } + + std::string Get(const std::string& k) { + ReadOptions options; + std::string result; + Status s = db_->Get(options, k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + int NumTableFilesAtLevel(int level) { + std::string property; + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(level), &property)); + return atoi(property.c_str()); + } + + // Return spread of files per level + std::string FilesPerLevel() { + std::string result; + size_t last_non_zero_offset = 0; + for (int level = 0; level < db_->NumberLevels(); level++) { + int f = NumTableFilesAtLevel(level); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } +}; + +TEST_F(CuckooTableDBTest, Flush) { + // Try with empty DB first. + ASSERT_TRUE(dbfull() != nullptr); + ASSERT_EQ("NOT_FOUND", Get("key2")); + + // Add some values to db. + Options options = CurrentOptions(); + Reopen(&options); + + ASSERT_OK(Put("key1", "v1")); + ASSERT_OK(Put("key2", "v2")); + ASSERT_OK(Put("key3", "v3")); + dbfull()->TEST_FlushMemTable(); + + TablePropertiesCollection ptc; + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(1U, ptc.size()); + ASSERT_EQ(3U, ptc.begin()->second->num_entries); + ASSERT_EQ("1", FilesPerLevel()); + + ASSERT_EQ("v1", Get("key1")); + ASSERT_EQ("v2", Get("key2")); + ASSERT_EQ("v3", Get("key3")); + ASSERT_EQ("NOT_FOUND", Get("key4")); + + // Now add more keys and flush. 
+ ASSERT_OK(Put("key4", "v4")); + ASSERT_OK(Put("key5", "v5")); + ASSERT_OK(Put("key6", "v6")); + dbfull()->TEST_FlushMemTable(); + + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(2U, ptc.size()); + auto row = ptc.begin(); + ASSERT_EQ(3U, row->second->num_entries); + ASSERT_EQ(3U, (++row)->second->num_entries); + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ("v1", Get("key1")); + ASSERT_EQ("v2", Get("key2")); + ASSERT_EQ("v3", Get("key3")); + ASSERT_EQ("v4", Get("key4")); + ASSERT_EQ("v5", Get("key5")); + ASSERT_EQ("v6", Get("key6")); + + ASSERT_OK(Delete("key6")); + ASSERT_OK(Delete("key5")); + ASSERT_OK(Delete("key4")); + dbfull()->TEST_FlushMemTable(); + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(3U, ptc.size()); + row = ptc.begin(); + ASSERT_EQ(3U, row->second->num_entries); + ASSERT_EQ(3U, (++row)->second->num_entries); + ASSERT_EQ(3U, (++row)->second->num_entries); + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_EQ("v1", Get("key1")); + ASSERT_EQ("v2", Get("key2")); + ASSERT_EQ("v3", Get("key3")); + ASSERT_EQ("NOT_FOUND", Get("key4")); + ASSERT_EQ("NOT_FOUND", Get("key5")); + ASSERT_EQ("NOT_FOUND", Get("key6")); +} + +TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) { + Options options = CurrentOptions(); + Reopen(&options); + ASSERT_OK(Put("key1", "v1")); + ASSERT_OK(Put("key2", "v2")); + ASSERT_OK(Put("key1", "v3")); // Duplicate + dbfull()->TEST_FlushMemTable(); + + TablePropertiesCollection ptc; + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(1U, ptc.size()); + ASSERT_EQ(2U, ptc.begin()->second->num_entries); + ASSERT_EQ("1", FilesPerLevel()); + ASSERT_EQ("v3", Get("key1")); + ASSERT_EQ("v2", Get("key2")); +} + +namespace { +static std::string Key(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "key_______%06d", i); + return std::string(buf); +} +static std::string Uint64Key(uint64_t i) { + std::string str; + str.resize(8); + memcpy(&str[0], static_cast(&i), 8); + return str; +} +} // namespace. + +TEST_F(CuckooTableDBTest, Uint64Comparator) { + Options options = CurrentOptions(); + options.comparator = test::Uint64Comparator(); + Reopen(&options); + + ASSERT_OK(Put(Uint64Key(1), "v1")); + ASSERT_OK(Put(Uint64Key(2), "v2")); + ASSERT_OK(Put(Uint64Key(3), "v3")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get(Uint64Key(1))); + ASSERT_EQ("v2", Get(Uint64Key(2))); + ASSERT_EQ("v3", Get(Uint64Key(3))); + ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4))); + + // Add more keys. + ASSERT_OK(Delete(Uint64Key(2))); // Delete. + dbfull()->TEST_FlushMemTable(); + ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. + ASSERT_OK(Put(Uint64Key(4), "v4")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get(Uint64Key(1))); + ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2))); + ASSERT_EQ("v0", Get(Uint64Key(3))); + ASSERT_EQ("v4", Get(Uint64Key(4))); +} + +TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) { + // Create a big L0 file and check it compacts into multiple files in L1. + Options options = CurrentOptions(); + options.write_buffer_size = 270 << 10; + // Two SST files should be created, each containing 14 keys. + // Number of buckets will be 16. Total size ~156 KB. 
+ options.target_file_size_base = 160 << 10; + Reopen(&options); + + // Write 28 values, each 10016 B ~ 10KB + for (int idx = 0; idx < 28; ++idx) { + ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1", FilesPerLevel()); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_EQ("0,2", FilesPerLevel()); + for (int idx = 0; idx < 28; ++idx) { + ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); + } +} + +TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { + // Insert same key twice so that they go to different SST files. Then wait for + // compaction and check if the latest value is stored and old value removed. + Options options = CurrentOptions(); + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 2; + Reopen(&options); + + // Write 11 values, each 10016 B + for (int idx = 0; idx < 11; ++idx) { + ASSERT_OK(Put(Key(idx), std::string(10000, 'a'))); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1", FilesPerLevel()); + + // Generate one more file in level-0, and should trigger level-0 compaction + for (int idx = 0; idx < 11; ++idx) { + ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + + ASSERT_EQ("0,1", FilesPerLevel()); + for (int idx = 0; idx < 11; ++idx) { + ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); + } +} + +TEST_F(CuckooTableDBTest, AdaptiveTable) { + Options options = CurrentOptions(); + + // Ensure options compatible with PlainTable + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + + // Write some keys using cuckoo table. + options.table_factory.reset(NewCuckooTableFactory()); + Reopen(&options); + + ASSERT_OK(Put("key1", "v1")); + ASSERT_OK(Put("key2", "v2")); + ASSERT_OK(Put("key3", "v3")); + dbfull()->TEST_FlushMemTable(); + + // Write some keys using plain table. + std::shared_ptr block_based_factory( + NewBlockBasedTableFactory()); + std::shared_ptr plain_table_factory( + NewPlainTableFactory()); + std::shared_ptr cuckoo_table_factory( + NewCuckooTableFactory()); + options.create_if_missing = false; + options.table_factory.reset(NewAdaptiveTableFactory( + plain_table_factory, block_based_factory, plain_table_factory, + cuckoo_table_factory)); + Reopen(&options); + ASSERT_OK(Put("key4", "v4")); + ASSERT_OK(Put("key1", "v5")); + dbfull()->TEST_FlushMemTable(); + + // Write some keys using block based table. 
+ options.table_factory.reset(NewAdaptiveTableFactory( + block_based_factory, block_based_factory, plain_table_factory, + cuckoo_table_factory)); + Reopen(&options); + ASSERT_OK(Put("key5", "v6")); + ASSERT_OK(Put("key2", "v7")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v5", Get("key1")); + ASSERT_EQ("v7", Get("key2")); + ASSERT_EQ("v3", Get("key3")); + ASSERT_EQ("v4", Get("key4")); + ASSERT_EQ("v6", Get("key5")); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + if (ROCKSDB_NAMESPACE::port::kLittleEndian) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); + } else { + fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n"); + return 0; + } +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/db_basic_test.cc b/src/rocksdb/db/db_basic_test.cc new file mode 100644 index 000000000..7573a01b4 --- /dev/null +++ b/src/rocksdb/db/db_basic_test.cc @@ -0,0 +1,2545 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "test_util/fault_injection_test_env.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif + +namespace ROCKSDB_NAMESPACE { + +class DBBasicTest : public DBTestBase { + public: + DBBasicTest() : DBTestBase("/db_basic_test") {} +}; + +TEST_F(DBBasicTest, OpenWhenOpen) { + Options options = CurrentOptions(); + options.env = env_; + ROCKSDB_NAMESPACE::DB* db2 = nullptr; + ROCKSDB_NAMESPACE::Status s = DB::Open(options, dbname_, &db2); + + ASSERT_EQ(Status::Code::kIOError, s.code()); + ASSERT_EQ(Status::SubCode::kNone, s.subcode()); + ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr); + + delete db2; +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, ReadOnlyDB) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + Close(); + + auto options = CurrentOptions(); + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + Iterator* iter = db_->NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } + ASSERT_EQ(count, 2); + delete iter; + Close(); + + // Reopen and flush memtable. + Reopen(options); + Flush(); + Close(); + // Now check keys in read only mode. 
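+  // ReadOnlyReopen() is a DBTestBase helper; a minimal sketch of the public
+  // API it wraps (DB::OpenForReadOnly is the standard read-only entry point):
+  //   DB* rdb = nullptr;
+  //   ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &rdb));
+  // A read-only DB serves reads from existing files but rejects writes.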
+ ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); +} + +TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + Close(); + + auto options = CurrentOptions(); + options.write_dbid_to_manifest = true; + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + std::string db_id1; + db_->GetDbIdentity(db_id1); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + Iterator* iter = db_->NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } + ASSERT_EQ(count, 2); + delete iter; + Close(); + + // Reopen and flush memtable. + Reopen(options); + Flush(); + Close(); + // Now check keys in read only mode. + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); + std::string db_id2; + db_->GetDbIdentity(db_id2); + ASSERT_EQ(db_id1, db_id2); +} + +TEST_F(DBBasicTest, CompactedDB) { + const uint64_t kFileSize = 1 << 20; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = kFileSize; + options.target_file_size_base = kFileSize; + options.max_bytes_for_level_base = 1 << 30; + options.compression = kNoCompression; + Reopen(options); + // 1 L0 file, use CompactedDB if max_open_files = -1 + ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); + Flush(); + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + Status s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported operation in read only mode."); + ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); + Close(); + options.max_open_files = -1; + ASSERT_OK(ReadOnlyReopen(options)); + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported in compacted db mode."); + ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); + Close(); + Reopen(options); + // Add more L0 files + ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); + Flush(); + ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); + Flush(); + ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); + ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); + Flush(); + Close(); + + ASSERT_OK(ReadOnlyReopen(options)); + // Fallback to read-only DB + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported operation in read only mode."); + Close(); + + // Full compaction + Reopen(options); + // Add more keys + ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); + ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h'))); + ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i'))); + ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j'))); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); + Close(); + + // CompactedDB + ASSERT_OK(ReadOnlyReopen(options)); + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported in compacted db mode."); + ASSERT_EQ("NOT_FOUND", Get("abc")); + ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa")); + ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb")); + ASSERT_EQ("NOT_FOUND", Get("ccc")); + ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee")); + ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff")); + ASSERT_EQ("NOT_FOUND", Get("ggg")); + 
ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+  ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+  // MultiGet
+  std::vector<std::string> values;
+  std::vector<Status> status_list = dbfull()->MultiGet(
+      ReadOptions(),
+      std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+                          Slice("ggg"), Slice("iii"), Slice("kkk")}),
+      &values);
+  ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+  ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+  ASSERT_OK(status_list[0]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+  ASSERT_TRUE(status_list[1].IsNotFound());
+  ASSERT_OK(status_list[2]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+  ASSERT_TRUE(status_list[3].IsNotFound());
+  ASSERT_OK(status_list[4]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+  ASSERT_TRUE(status_list[5].IsNotFound());
+
+  Reopen(options);
+  // Add a key
+  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+}
+
+TEST_F(DBBasicTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
+
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, PutDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBBasicTest, PutSingleDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo2", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo2"));
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the
+    // test case. Skip MergePut because single delete does not get removed
+    // when it encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, EmptyFlush) {
+  // It is possible to produce empty flushes when using single deletes. Tests
+  // whether empty flushes cause issues.
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    Put(1, "a", Slice());
+    SingleDelete(1, "a");
+    ASSERT_OK(Flush(1));
+
+    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+    // Skip FIFO and universal compaction as they do not apply to the test
+    // case.
Skip MergePut because merges cannot be combined with single + // deletions. + } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | + kSkipMergePut)); +} + +TEST_F(DBBasicTest, GetFromVersions) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + } while (ChangeOptions()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, GetSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + // Try with both a short key and a long key + for (int i = 0; i < 2; i++) { + std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); + ASSERT_OK(Put(1, key, "v1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Put(1, key, "v2")); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, key)); + ASSERT_EQ("v1", Get(1, key, s1)); + db_->ReleaseSnapshot(s1); + } + } while (ChangeOptions()); +} +#endif // ROCKSDB_LITE + +TEST_F(DBBasicTest, CheckLock) { + do { + DB* localdb; + Options options = CurrentOptions(); + ASSERT_OK(TryReopen(options)); + + // second open should fail + ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, FlushMultipleMemtable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_size_to_maintain = -1; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); + + Options options = CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_size_to_maintain = + static_cast(options.write_buffer_size); + CreateAndReopenWithCF({"pikachu"}, options); + + // Compaction can still go through even if no thread can flush the + // mem table. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + + // Flush can still go through. 
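+  // (The HIGH-priority pool was just released above while the LOW-priority
+  // pool is still occupied, and flushes are scheduled on the HIGH pool in
+  // this test, so flushes can now proceed even though compactions remain
+  // blocked.)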
+ ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST_F(DBBasicTest, FLUSH) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + SetPerfLevel(kEnableTime); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + // this will now also flush the last 2 writes + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + get_perf_context()->Reset(); + Get(1, "foo"); + ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0); + ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + ASSERT_OK(Flush(1)); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v2", Get(1, "bar")); + get_perf_context()->Reset(); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + ASSERT_OK(Flush(1)); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + // 'foo' should be there because its put + // has WAL enabled. + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); + + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, ManifestRollOver) { + do { + Options options; + options.max_manifest_file_size = 10; // 10 bytes + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + { + ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); + ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); + ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); + uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_OK(Flush(1)); // This should trigger LogAndApply. + uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); + ASSERT_GT(manifest_after_flush, manifest_before_flush); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); + // check if a new manifest file got inserted or not. 
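+      // (TEST_Current_Manifest_FileNo() reports the live MANIFEST number;
+      // reopening the DB writes a fresh MANIFEST, which is why the number is
+      // expected to grow across the ReopenWithColumnFamilies call above.)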
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts1) {
+  do {
+    std::string id1;
+    ASSERT_OK(db_->GetDbIdentity(id1));
+
+    Options options = CurrentOptions();
+    Reopen(options);
+    std::string id2;
+    ASSERT_OK(db_->GetDbIdentity(id2));
+    // id1 should match id2 because identity was not regenerated
+    ASSERT_EQ(id1.compare(id2), 0);
+
+    std::string idfilename = IdentityFileName(dbname_);
+    ASSERT_OK(env_->DeleteFile(idfilename));
+    Reopen(options);
+    std::string id3;
+    ASSERT_OK(db_->GetDbIdentity(id3));
+    if (options.write_dbid_to_manifest) {
+      ASSERT_EQ(id1.compare(id3), 0);
+    } else {
+      // id1 should NOT match id3 because identity was regenerated
+      ASSERT_NE(id1.compare(id3), 0);
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts2) {
+  do {
+    std::string id1;
+    ASSERT_OK(db_->GetDbIdentity(id1));
+
+    Options options = CurrentOptions();
+    options.write_dbid_to_manifest = true;
+    Reopen(options);
+    std::string id2;
+    ASSERT_OK(db_->GetDbIdentity(id2));
+    // id1 should match id2 because identity was not regenerated
+    ASSERT_EQ(id1.compare(id2), 0);
+
+    std::string idfilename = IdentityFileName(dbname_);
+    ASSERT_OK(env_->DeleteFile(idfilename));
+    Reopen(options);
+    std::string id3;
+    ASSERT_OK(db_->GetDbIdentity(id3));
+    // id1 should still match id3: with write_dbid_to_manifest set, the
+    // identity is recovered from the MANIFEST even though the IDENTITY file
+    // was deleted
+    ASSERT_EQ(id1, id3);
+  } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, Snapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    Put(0, "foo", "0v1");
+    Put(1, "foo", "1v1");
+
+    const Snapshot* s1 = db_->GetSnapshot();
+    ASSERT_EQ(1U, GetNumSnapshots());
+    uint64_t time_snap1 = GetTimeOldestSnapshots();
+    ASSERT_GT(time_snap1, 0U);
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    Put(0, "foo", "0v2");
+    Put(1, "foo", "1v2");
+
+    env_->addon_time_.fetch_add(1);
+
+    const Snapshot* s2 = db_->GetSnapshot();
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    Put(0, "foo", "0v3");
+    Put(1, "foo", "1v3");
+
+    {
+      ManagedSnapshot s3(db_);
+      ASSERT_EQ(3U, GetNumSnapshots());
+      ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+      ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+      Put(0, "foo", "0v4");
+      Put(1, "foo", "1v4");
+      ASSERT_EQ("0v1", Get(0, "foo", s1));
+      ASSERT_EQ("1v1", Get(1, "foo", s1));
+      ASSERT_EQ("0v2", Get(0, "foo", s2));
+      ASSERT_EQ("1v2", Get(1, "foo", s2));
+      ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+      ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+      ASSERT_EQ("0v4", Get(0, "foo"));
+      ASSERT_EQ("1v4", Get(1, "foo"));
+    }
+
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo")); + ASSERT_EQ(1U, GetNumSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber()); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ(0U, GetNumSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), 0); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } while (ChangeOptions()); +} + +#endif // ROCKSDB_LITE + +TEST_F(DBBasicTest, CompactBetweenSnapshots) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + FillLevels("a", "z", 1); + + Put(1, "foo", "first"); + const Snapshot* snapshot1 = db_->GetSnapshot(); + Put(1, "foo", "second"); + Put(1, "foo", "third"); + Put(1, "foo", "fourth"); + const Snapshot* snapshot2 = db_->GetSnapshot(); + Put(1, "foo", "fifth"); + Put(1, "foo", "sixth"); + + // All entries (including duplicates) exist + // before any compaction or flush is triggered. + ASSERT_EQ(AllEntriesFor("foo", 1), + "[ sixth, fifth, fourth, third, second, first ]"); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ("first", Get(1, "foo", snapshot1)); + + // After a flush, "second", "third" and "fifth" should + // be removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); + + // after we release the snapshot1, only two values left + db_->ReleaseSnapshot(snapshot1); + FillLevels("a", "z", 1); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + + // We have only one valid snapshot snapshot2. Since snapshot1 is + // not valid anymore, "first" should be removed by a compaction. 
+ ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); + + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z", 1); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); + } while (ChangeOptions(kSkipFIFOCompaction)); +} + +TEST_F(DBBasicTest, DBOpen_Options) { + Options options = CurrentOptions(); + Close(); + Destroy(options); + + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + options.create_if_missing = false; + Status s = DB::Open(options, dbname_, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does not exist, and create_if_missing == true: OK + options.create_if_missing = true; + s = DB::Open(options, dbname_, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + // Does exist, and error_if_exists == true: error + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname_, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does exist, and error_if_exists == false: OK + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname_, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; +} + +TEST_F(DBBasicTest, CompactOnFlush) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); + + // Write two new keys + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + + // Case1: Delete followed by a put + Delete(1, "foo"); + Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); + + // After the current memtable is flushed, the DEL should + // have been removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); + + // Case 2: Delete followed by another delete + Delete(1, "foo"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); + + // Case 3: Put followed by a delete + Put(1, "foo", "v3"); + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); + + // Case 4: Put followed by another Put + Put(1, "foo", "v4"); + Put(1, "foo", "v5"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); + + // clear database + Delete(1, "foo"); + 
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 5: Put followed by snapshot followed by another Put
+    // Both puts should remain.
+    Put(1, "foo", "v6");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put(1, "foo", "v7");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+    db_->ReleaseSnapshot(snapshot);
+
+    // clear database
+    Delete(1, "foo");
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 6: snapshot followed by a put followed by another Put
+    // Only the last put should remain.
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put(1, "foo", "v8");
+    Put(1, "foo", "v9");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+    db_->ReleaseSnapshot(snapshot1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushOneColumnFamily) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
+
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "pikachu", "pikachu"));
+  ASSERT_OK(Put(2, "ilya", "ilya"));
+  ASSERT_OK(Put(3, "muromec", "muromec"));
+  ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+  ASSERT_OK(Put(5, "nikitich", "nikitich"));
+  ASSERT_OK(Put(6, "alyosha", "alyosha"));
+  ASSERT_OK(Put(7, "popovich", "popovich"));
+
+  for (int i = 0; i < 8; ++i) {
+    Flush(i);
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), i + 1U);
+  }
+}
+
+TEST_F(DBBasicTest, MultiGetSimple) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+    std::vector<std::string> values(20, "Temporary data to be overwritten");
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+    get_perf_context()->Reset();
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(values[0], "v1");
+    ASSERT_EQ(values[1], "v2");
+    ASSERT_EQ(values[2], "v3");
+    ASSERT_EQ(values[4], "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Empty Key Set
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+    std::vector<ColumnFamilyHandle*> cfs;
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Empty Key Set
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Search for Keys
+    keys.resize(2);
+    keys[0] = "a";
+    keys[1] = "b";
+    cfs.push_back(handles_[0]);
+    cfs.push_back(handles_[1]);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(static_cast<int>(s.size()), 2);
+    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ChecksumTest) {
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  // change when new checksum type added
+  int max_checksum = static_cast<int>(kxxHash64);
+  const int kNumPerFile = 2;
+
+  // generate one table with each type of checksum
+  for (int i = 0; i <= max_checksum; ++i) {
+    table_options.checksum = static_cast<ChecksumType>(i);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // with each valid checksum type setting...
+  for (int i = 0; i <= max_checksum; ++i) {
+    table_options.checksum = static_cast<ChecksumType>(i);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    // verify every type of checksum (should be regardless of that setting)
+    for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) {
+      ASSERT_EQ(Key(j), Get(Key(j)));
+    }
+  }
+}
+
+// On Windows you can have either a memory mapped file or a file with
+// unbuffered access, so this combination asserts and the test does not
+// make sense to run there
+#ifndef OS_WIN
+TEST_F(DBBasicTest, MmapAndBufferOptions) {
+  if (!IsMemoryMappedAccessSupported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+
+  options.use_direct_reads = true;
+  options.allow_mmap_reads = true;
+  ASSERT_NOK(TryReopen(options));
+
+  // All other combinations are acceptable
+  options.use_direct_reads = false;
+  ASSERT_OK(TryReopen(options));
+
+  if (IsDirectIOSupported()) {
+    options.use_direct_reads = true;
+    options.allow_mmap_reads = false;
+    ASSERT_OK(TryReopen(options));
+  }
+
+  options.use_direct_reads = false;
+  ASSERT_OK(TryReopen(options));
+}
+#endif
+
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
+
+  class TestLogger : public Logger {
+   public:
+    using Logger::Logv;
+    explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+    ~TestLogger() override {
+      if (!closed_) {
+        CloseHelper();
+      }
+    }
+    void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+   protected:
+    Status CloseImpl() override { return CloseHelper(); }
+
+   private:
+    Status CloseHelper() {
+      env->CloseCountInc();
+      return Status::IOError();
+    }
+    TestEnv* env;
+  };
+
+  void CloseCountInc() { close_count++; }
+
+  int GetCloseCount() { return close_count; }
+
+  Status NewLogger(const std::string& /*fname*/,
+                   std::shared_ptr<Logger>* result) override {
+    result->reset(new TestLogger(this));
+    return Status::OK();
+  }
+
+ private:
+  int close_count;
+};
+
+TEST_F(DBBasicTest, DBClose) {
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("db_close_test");
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  DB* db = nullptr;
+  TestEnv* env = new TestEnv(env_);
+  std::unique_ptr<TestEnv> local_env_guard(env);
+  options.create_if_missing = true;
+  options.env = env;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  s = db->Close();
+  ASSERT_EQ(env->GetCloseCount(), 1);
+  ASSERT_EQ(s, Status::IOError());
+
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 1);
+
+  // Do not call DB::Close() and ensure our logger Close() still gets called
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 2);
+
+  // Provide our own logger and ensure DB::Close() does not close it
+  options.info_log.reset(new TestEnv::TestLogger(env));
+  options.create_if_missing = false;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  s = db->Close();
+  ASSERT_EQ(s, Status::OK());
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 2);
+  options.info_log.reset();
+  ASSERT_EQ(env->GetCloseCount(), 3);
+}
+
+TEST_F(DBBasicTest, DBCloseFlushError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.manual_wal_flush = true;
+  options.write_buffer_size = 100;
+  options.env = fault_injection_env.get();
+
+  Reopen(options);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+  ASSERT_OK(Put("key3", "value3"));
+  fault_injection_env->SetFilesystemActive(false);
+  Status s = dbfull()->Close();
+  fault_injection_env->SetFilesystemActive(true);
+  ASSERT_NE(s, Status::OK());
+
+  Destroy(options);
+}
+
+class DBMultiGetTestWithParam : public DBBasicTest,
+                                public testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
+  // <CF, key, value> tuples
+  std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
+  static const int num_keys = 24;
+  cf_kv_vec.reserve(num_keys);
+
+  for (int i = 0; i < num_keys; ++i) {
+    int cf = i / 3;
+    int cf_key = 1 % 3;
+    cf_kv_vec.emplace_back(std::make_tuple(
+        cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
+        "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
+    ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+                  std::get<2>(cf_kv_vec[i])));
+  }
+
+  int get_sv_count = 0;
+  ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast<DBImpl*>(db_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+        if (++get_sv_count == 2) {
+          // After MultiGet refs a couple of CFs, flush all CFs so MultiGet
+          // is forced to repeat the process
+          for (int i = 0; i < num_keys; ++i) {
+            int cf = i / 3;
+            int cf_key = i % 8;
+            if (cf_key == 0) {
+              ASSERT_OK(Flush(cf));
+            }
+            ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+                          std::get<2>(cf_kv_vec[i]) + "_2"));
+          }
+        }
+        if (get_sv_count == 11) {
+          for (int i = 0; i < 8; ++i) {
+            auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+                            db->GetColumnFamilyHandle(i))
+                            ->cfd();
+            ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<int> cfs;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 0; i < num_keys; ++i) {
+    cfs.push_back(std::get<0>(cf_kv_vec[i]));
+    keys.push_back(std::get<1>(cf_kv_vec[i]));
+  }
+
+  values = MultiGet(cfs, keys, nullptr, GetParam());
+  ASSERT_EQ(values.size(), num_keys);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
+  }
+
+  keys.clear();
+  cfs.clear();
+  cfs.push_back(std::get<0>(cf_kv_vec[0]));
+  keys.push_back(std::get<1>(cf_kv_vec[0]));
+  cfs.push_back(std::get<0>(cf_kv_vec[3]));
+  keys.push_back(std::get<1>(cf_kv_vec[3]));
+  cfs.push_back(std::get<0>(cf_kv_vec[4]));
+  keys.push_back(std::get<1>(cf_kv_vec[4]));
+  values = MultiGet(cfs, keys, nullptr, GetParam());
+  ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
+  ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2"); + + keys.clear(); + cfs.clear(); + cfs.push_back(std::get<0>(cf_kv_vec[7])); + keys.push_back(std::get<1>(cf_kv_vec[7])); + cfs.push_back(std::get<0>(cf_kv_vec[6])); + keys.push_back(std::get<1>(cf_kv_vec[6])); + cfs.push_back(std::get<0>(cf_kv_vec[1])); + keys.push_back(std::get<1>(cf_kv_vec[1])); + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2"); + ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2"); + ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2"); + + for (int cf = 0; cf < 8; ++cf) { + auto* cfd = reinterpret_cast( + reinterpret_cast(db_)->GetColumnFamilyHandle(cf)) + ->cfd(); + ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); + ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); + } +} + +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); + + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val")); + } + + int get_sv_count = 0; + int retries = 0; + bool last_try = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) { + last_try = true; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { + if (last_try) { + return; + } + if (++get_sv_count == 2) { + ++retries; + get_sv_count = 0; + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(Put( + i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val" + std::to_string(retries))); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector cfs; + std::vector keys; + std::vector values; + + for (int i = 0; i < 8; ++i) { + cfs.push_back(i); + keys.push_back("cf" + std::to_string(i) + "_key"); + } + + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_TRUE(last_try); + ASSERT_EQ(values.size(), 8); + for (unsigned int j = 0; j < values.size(); ++j) { + ASSERT_EQ(values[j], + "cf" + std::to_string(j) + "_val" + std::to_string(retries)); + } + for (int i = 0; i < 8; ++i) { + auto* cfd = reinterpret_cast( + reinterpret_cast(db_)->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); + } +} + +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); + + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val")); + } + + int get_sv_count = 0; + ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast(db_); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { + if (++get_sv_count == 2) { + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val2")); + } + } + if (get_sv_count == 8) { + for (int i = 0; i < 8; ++i) { + auto* cfd = reinterpret_cast( + db->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_TRUE( + (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) || + (cfd->TEST_GetLocalSV()->Get() 
== SuperVersion::kSVObsolete));
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<int> cfs;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 0; i < 8; ++i) {
+    cfs.push_back(i);
+    keys.push_back("cf" + std::to_string(i) + "_key");
+  }
+
+  const Snapshot* snapshot = db_->GetSnapshot();
+  values = MultiGet(cfs, keys, snapshot, GetParam());
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_EQ(values.size(), 8);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
+  }
+  for (int i = 0; i < 8; ++i) {
+    auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
+                    reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(i))
+                    ->cfd();
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
+                        testing::Bool());
+
+TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    get_perf_context()->Reset();
+
+    std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k2", "k1"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+                  values.data(), s.data(), false);
+
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2");
+    ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_TRUE(s[0].IsNotFound());
+    ASSERT_OK(s[1]);
+    ASSERT_TRUE(s[2].IsNotFound());
+    ASSERT_OK(s[3]);
+    ASSERT_OK(s[4]);
+    ASSERT_OK(s[5]);
+
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetBatchedSimpleSorted) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    get_perf_context()->Reset();
+
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+                  values.data(), s.data(), true);
+
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2");
+    ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest,
MultiGetBatchedMultiLevel) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + int num_keys = 0; + + for (int i = 0; i < 128; ++i) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + Flush(); + num_keys = 0; + } + } + if (num_keys > 0) { + Flush(); + num_keys = 0; + } + MoveFilesToLevel(2); + + for (int i = 0; i < 128; i += 3) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + Flush(); + num_keys = 0; + } + } + if (num_keys > 0) { + Flush(); + num_keys = 0; + } + MoveFilesToLevel(1); + + for (int i = 0; i < 128; i += 5) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + Flush(); + num_keys = 0; + } + } + if (num_keys > 0) { + Flush(); + num_keys = 0; + } + ASSERT_EQ(0, num_keys); + + for (int i = 0; i < 128; i += 9) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + } + + std::vector keys; + std::vector values; + + for (int i = 64; i < 80; ++i) { + keys.push_back("key_" + std::to_string(i)); + } + + values = MultiGet(keys, nullptr); + ASSERT_EQ(values.size(), 16); + for (unsigned int j = 0; j < values.size(); ++j) { + int key = j + 64; + if (key % 9 == 0) { + ASSERT_EQ(values[j], "val_mem_" + std::to_string(key)); + } else if (key % 5 == 0) { + ASSERT_EQ(values[j], "val_l0_" + std::to_string(key)); + } else if (key % 3 == 0) { + ASSERT_EQ(values[j], "val_l1_" + std::to_string(key)); + } else { + ASSERT_EQ(values[j], "val_l2_" + std::to_string(key)); + } + } +} + +TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + int num_keys = 0; + + for (int i = 0; i < 128; ++i) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + Flush(); + num_keys = 0; + } + } + if (num_keys > 0) { + Flush(); + num_keys = 0; + } + MoveFilesToLevel(2); + + for (int i = 0; i < 128; i += 3) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + Flush(); + num_keys = 0; + } + } + if (num_keys > 0) { + Flush(); + num_keys = 0; + } + MoveFilesToLevel(1); + + for (int i = 0; i < 128; i += 5) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + Flush(); + num_keys = 0; + } + } + if (num_keys > 0) { + Flush(); + num_keys = 0; + } + ASSERT_EQ(0, num_keys); + + for (int i = 0; i < 128; i += 9) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + } + + std::vector keys; + std::vector values; + + for (int i = 32; i < 80; ++i) { + keys.push_back("key_" + std::to_string(i)); + } + + values = MultiGet(keys, nullptr); + ASSERT_EQ(values.size(), keys.size()); + for (unsigned int j = 0; j < 48; ++j) { + int key = j + 32; + std::string value; + value.append("val_l2_" + std::to_string(key)); + if (key % 3 == 0) { + value.append(","); + value.append("val_l1_" + std::to_string(key)); + } + if (key % 5 == 0) { + value.append(","); + value.append("val_l0_" + std::to_string(key)); + } + if (key % 9 == 0) { + 
value.append(","); + value.append("val_mem_" + std::to_string(key)); + } + ASSERT_EQ(values[j], value); + } +} + +// Test class for batched MultiGet with prefix extractor +// Param bool - If true, use partitioned filters +// If false, use full filter block +class MultiGetPrefixExtractorTest : public DBBasicTest, + public ::testing::WithParamInterface { +}; + +TEST_P(MultiGetPrefixExtractorTest, Batched) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_prefix_bloom_size_ratio = 10; + BlockBasedTableOptions bbto; + if (GetParam()) { + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.partition_filters = true; + } + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + bbto.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); + + // First key is not in the prefix_extractor domain + ASSERT_OK(Put("k", "v0")); + ASSERT_OK(Put("kk1", "v1")); + ASSERT_OK(Put("kk2", "v2")); + ASSERT_OK(Put("kk3", "v3")); + ASSERT_OK(Put("kk4", "v4")); + std::vector mem_keys( + {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"}); + std::vector inmem_values; + inmem_values = MultiGet(mem_keys, nullptr); + ASSERT_EQ(inmem_values[0], "v0"); + ASSERT_EQ(inmem_values[1], "v1"); + ASSERT_EQ(inmem_values[2], "v2"); + ASSERT_EQ(inmem_values[3], "v3"); + ASSERT_EQ(inmem_values[4], "v4"); + ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2); + ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 5); + ASSERT_OK(Flush()); + + std::vector keys({"k", "kk1", "kk2", "kk3", "kk4"}); + std::vector values; + get_perf_context()->Reset(); + values = MultiGet(keys, nullptr); + ASSERT_EQ(values[0], "v0"); + ASSERT_EQ(values[1], "v1"); + ASSERT_EQ(values[2], "v2"); + ASSERT_EQ(values[3], "v3"); + ASSERT_EQ(values[4], "v4"); + // Filter hits for 4 in-domain keys + ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4); +} + +INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest, + ::testing::Bool()); + +#ifndef ROCKSDB_LITE +class DBMultiGetRowCacheTest : public DBBasicTest, + public ::testing::WithParamInterface {}; + +TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) { + do { + option_config_ = kRowCache; + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + SetPerfLevel(kEnableCount); + ASSERT_OK(Put(1, "k1", "v1")); + ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + Flush(1); + ASSERT_OK(Put(1, "k5", "v5")); + const Snapshot* snap1 = dbfull()->GetSnapshot(); + ASSERT_OK(Delete(1, "k4")); + Flush(1); + const Snapshot* snap2 = dbfull()->GetSnapshot(); + + get_perf_context()->Reset(); + + std::vector keys({"no_key", "k5", "k4", "k3", "k1"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + bool use_snapshots = GetParam(); + if (use_snapshots) { + ro.snapshot = snap2; + } + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); + // four 
kv pairs * two bytes per value + ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_TRUE(s[0].IsNotFound()); + ASSERT_OK(s[1]); + ASSERT_TRUE(s[2].IsNotFound()); + ASSERT_OK(s[3]); + ASSERT_OK(s[4]); + + // Call MultiGet() again with some intersection with the previous set of + // keys. Those should already be in the row cache. + keys.assign({"no_key", "k5", "k3", "k2"}); + for (size_t i = 0; i < keys.size(); ++i) { + values[i].Reset(); + s[i] = Status::OK(); + } + get_perf_context()->Reset(); + + if (use_snapshots) { + ro.snapshot = snap1; + } + db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(), + values.data(), s.data(), false); + + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2"); + ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_TRUE(s[0].IsNotFound()); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_OK(s[3]); + if (use_snapshots) { + // Only reads from the first SST file would have been cached, since + // snapshot seq no is > fd.largest_seqno + ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT)); + } else { + ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT)); + } + + SetPerfLevel(kDisable); + dbfull()->ReleaseSnapshot(snap1); + dbfull()->ReleaseSnapshot(snap2); + } while (ChangeCompactOptions()); +} + +INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest, + testing::Values(true, false)); + +TEST_F(DBBasicTest, GetAllKeyVersions) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + const size_t kNumInserts = 4; + const size_t kNumDeletes = 4; + const size_t kNumUpdates = 4; + + // Check default column family + for (size_t i = 0; i != kNumInserts; ++i) { + ASSERT_OK(Put(std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates; ++i) { + ASSERT_OK(Put(std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes; ++i) { + ASSERT_OK(Delete(std::to_string(i))); + } + std::vector key_versions; + ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions( + db_, Slice(), Slice(), std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions( + db_, handles_[0], Slice(), Slice(), std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + + // Check non-default column family + for (size_t i = 0; i != kNumInserts - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes - 1; ++i) { + ASSERT_OK(Delete(1, std::to_string(i))); + } + ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions( + db_, handles_[1], Slice(), Slice(), std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size()); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.block_size = 16 * 1024; + 
assert(table_options.block_size > + BlockBasedTable::kMultiGetReadStackBufSize); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + std::string value(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), value) == Status::OK()); + } + Flush(); + + std::vector<std::string> key_data(10); + std::vector<Slice> keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector<PinnableSlice> values(10); + std::vector<Status> statuses; + ReadOptions ro; + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); +} + +class DBBasicTestWithParallelIO + : public DBTestBase, + public testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> { + public: + DBBasicTestWithParallelIO() : DBTestBase("/db_basic_test_with_parallel_io") { + bool compressed_cache = std::get<0>(GetParam()); + bool uncompressed_cache = std::get<1>(GetParam()); + compression_enabled_ = std::get<2>(GetParam()); + fill_cache_ = std::get<3>(GetParam()); + + if (compressed_cache) { + std::shared_ptr<Cache> cache = NewLRUCache(1048576); + compressed_cache_ = std::make_shared<MyBlockCache>(cache); + } + if (uncompressed_cache) { + std::shared_ptr<Cache> cache = NewLRUCache(1048576); + uncompressed_cache_ = std::make_shared<MyBlockCache>(cache); + } + + env_->count_random_reads_ = true; + + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + +#ifndef ROCKSDB_LITE + if (compression_enabled_) { + std::vector<CompressionType> compression_types; + compression_types = GetSupportedCompressions(); + // Not every platform may have compression libraries available, so + // dynamically pick based on what's available + if (compression_types.size() == 0) { + compression_enabled_ = false; + } else { + options.compression = compression_types[0]; + } + } +#else + // GetSupportedCompressions() is not available in LITE build + if (!Snappy_Supported()) { + compression_enabled_ = false; + } +#endif  // ROCKSDB_LITE + + table_options.block_cache = uncompressed_cache_; + if (table_options.block_cache == nullptr) { + table_options.no_block_cache = true; + } else { + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + } + table_options.block_cache_compressed = compressed_cache_; + table_options.flush_block_policy_factory.reset( + new MyFlushBlockPolicyFactory()); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + if (!compression_enabled_) { + options.compression = kNoCompression; + } + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. 
A purely random string doesn't compress + // and the resultant data block will not be compressed + values_.emplace_back(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), values_[i]) == Status::OK()); + } + Flush(); + + for (int i = 0; i < 100; ++i) { + // block cannot gain space by compression + uncompressable_values_.emplace_back(RandomString(&rnd, 256) + '\0'); + std::string tmp_key = "a" + Key(i); + assert(Put(tmp_key, uncompressable_values_[i]) == Status::OK()); + } + Flush(); + } + + bool CheckValue(int i, const std::string& value) { + if (values_[i].compare(value) == 0) { + return true; + } + return false; + } + + bool CheckUncompressableValue(int i, const std::string& value) { + if (uncompressable_values_[i].compare(value) == 0) { + return true; + } + return false; + } + + int num_lookups() { return uncompressed_cache_->num_lookups(); } + int num_found() { return uncompressed_cache_->num_found(); } + int num_inserts() { return uncompressed_cache_->num_inserts(); } + + int num_lookups_compressed() { return compressed_cache_->num_lookups(); } + int num_found_compressed() { return compressed_cache_->num_found(); } + int num_inserts_compressed() { return compressed_cache_->num_inserts(); } + + bool fill_cache() { return fill_cache_; } + bool compression_enabled() { return compression_enabled_; } + bool has_compressed_cache() { return compressed_cache_ != nullptr; } + bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + private: + class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { + public: + MyFlushBlockPolicyFactory() {} + + virtual const char* Name() const override { + return "MyFlushBlockPolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& data_block_builder) const override { + return new MyFlushBlockPolicy(data_block_builder); + } + }; + + class MyFlushBlockPolicy : public FlushBlockPolicy { + public: + explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder) + : num_keys_(0), data_block_builder_(data_block_builder) {} + + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (data_block_builder_.empty()) { + // First key in this block + num_keys_ = 1; + return false; + } + // Flush every 10 keys + if (num_keys_ == 10) { + num_keys_ = 1; + return true; + } + num_keys_++; + return false; + } + + private: + int num_keys_; + const BlockBuilder& data_block_builder_; + }; + + class MyBlockCache : public Cache { + public: + explicit MyBlockCache(std::shared_ptr& target) + : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} + + virtual const char* Name() const override { return "MyBlockCache"; } + + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + num_inserts_++; + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + virtual Handle* Lookup(const Slice& key, + Statistics* stats = nullptr) override { + num_lookups_++; + Handle* handle = target_->Lookup(key, stats); + if (handle != nullptr) { + num_found_++; + } + return handle; + } + + virtual bool Ref(Handle* handle) override { return target_->Ref(handle); } + + virtual bool Release(Handle* handle, bool force_erase = false) override { + return target_->Release(handle, force_erase); + } + + virtual void* 
Value(Handle* handle) override { + return target_->Value(handle); + } + + virtual void Erase(const Slice& key) override { target_->Erase(key); } + virtual uint64_t NewId() override { return target_->NewId(); } + + virtual void SetCapacity(size_t capacity) override { + target_->SetCapacity(capacity); + } + + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + virtual bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + virtual size_t GetCapacity() const override { + return target_->GetCapacity(); + } + + virtual size_t GetUsage() const override { return target_->GetUsage(); } + + virtual size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + virtual size_t GetPinnedUsage() const override { + return target_->GetPinnedUsage(); + } + + virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; } + + virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + return target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + virtual void EraseUnRefEntries() override { + return target_->EraseUnRefEntries(); + } + + int num_lookups() { return num_lookups_; } + + int num_found() { return num_found_; } + + int num_inserts() { return num_inserts_; } + + private: + std::shared_ptr target_; + int num_lookups_; + int num_found_; + int num_inserts_; + }; + + std::shared_ptr compressed_cache_; + std::shared_ptr uncompressed_cache_; + bool compression_enabled_; + std::vector values_; + std::vector uncompressable_values_; + bool fill_cache_; +}; + +TEST_P(DBBasicTestWithParallelIO, MultiGet) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + bool read_from_cache = false; + if (fill_cache()) { + if (has_uncompressed_cache()) { + read_from_cache = true; + } else if (has_compressed_cache() && compression_enabled()) { + read_from_cache = true; + } + } + + int expected_reads = random_reads + (read_from_cache ? 
0 : 2); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + + keys.resize(10); + statuses.resize(10); + std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85}; + for (size_t i = 0; i < key_ints.size(); ++i) { + key_data[i] = Key(key_ints[i]); + keys[i] = Slice(key_data[i]); + statuses[i] = Status::OK(); + values[i].Reset(); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + for (size_t i = 0; i < key_ints.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString())); + } + if (compression_enabled() && !has_compressed_cache()) { + expected_reads += (read_from_cache ? 2 : 3); + } else { + expected_reads += (read_from_cache ? 2 : 4); + } + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + + keys.resize(10); + statuses.resize(10); + std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85}; + for (size_t i = 0; i < key_uncmp.size(); ++i) { + key_data[i] = "a" + Key(key_uncmp[i]); + keys[i] = Slice(key_data[i]); + statuses[i] = Status::OK(); + values[i].Reset(); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + for (size_t i = 0; i < key_uncmp.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString())); + } + if (compression_enabled() && !has_compressed_cache()) { + expected_reads += (read_from_cache ? 3 : 3); + } else { + expected_reads += (read_from_cache ? 4 : 4); + } + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + + keys.resize(5); + statuses.resize(5); + std::vector<int> key_tr{1, 2, 15, 16, 55}; + for (size_t i = 0; i < key_tr.size(); ++i) { + key_data[i] = "a" + Key(key_tr[i]); + keys[i] = Slice(key_data[i]); + statuses[i] = Status::OK(); + values[i].Reset(); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + for (size_t i = 0; i < key_tr.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString())); + } + if (compression_enabled() && !has_compressed_cache()) { + expected_reads += (read_from_cache ? 0 : 2); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + } else { + if (has_uncompressed_cache()) { + expected_reads += (read_from_cache ? 0 : 3); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + } else { + // A rare case: even with block compression enabled, some data + // blocks may remain uncompressed due to their content. If the user + // enables only the compressed cache, those uncompressed blocks will + // not be cached, and extra block reads will be triggered. The exact + // number of reads depends on the compression algorithm. 
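+      // For example (illustrative numbers only, nothing here asserts an
+      // exact split): if two of the blocks holding these five keys happened
+      // to be stored uncompressed, two extra reads would occur under one
+      // algorithm while another algorithm might compress them all, hence
+      // only a lower bound is checked below.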
+ ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads); + } + } +} + +TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + int read_count = 0; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + SyncPoint::GetInstance()->SetCallBack( + "RetrieveMultipleBlocks:VerifyChecksum", [&](void *status) { + Status* s = static_cast(status); + read_count++; + if (read_count == 2) { + *s = Status::Corruption(); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + //ASSERT_TRUE(CheckValue(50, values[1].ToString())); + ASSERT_EQ(statuses[0], Status::OK()); + ASSERT_EQ(statuses[1], Status::Corruption()); + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + SyncPoint::GetInstance()->SetCallBack( + "TableCache::MultiGet:FindTable", [&](void *status) { + Status* s = static_cast(status); + *s = Status::IOError(); + }); + // DB open will create table readers unless we reduce the table cache + // capacity. + // SanitizeOptions will set max_open_files to minimum of 20. Table cache + // is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 11 so table cache capacity will become 1. 
This will + // prevent file open during DB open and force the file to be opened + // during MultiGet + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void *arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(CurrentOptions()); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_EQ(statuses[0], Status::IOError()); + ASSERT_EQ(statuses[1], Status::IOError()); + + SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P( + ParallelIO, DBBasicTestWithParallelIO, + // Params are as follows - + // Param 0 - Compressed cache enabled + // Param 1 - Uncompressed cache enabled + // Param 2 - Data compression enabled + // Param 3 - ReadOptions::fill_cache + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool())); + +class DBBasicTestWithTimestampBase : public DBTestBase { + public: + explicit DBBasicTestWithTimestampBase(const std::string& dbname) + : DBTestBase(dbname) {} + + protected: + class TestComparatorBase : public Comparator { + public: + explicit TestComparatorBase(size_t ts_sz) : Comparator(ts_sz) {} + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + virtual int CompareImpl(const Slice& a, const Slice& b) const = 0; + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + assert(a.size() >= timestamp_size()); + assert(b.size() >= timestamp_size()); + Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); + Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + + return CompareImpl(k1, k2); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + auto* ptr1 = const_cast(&ts1); + auto* ptr2 = const_cast(&ts2); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return 1; + } else if (high1 > high2) { + return -1; + } + if (low1 < low2) { + return 1; + } else if (low1 > low2) { + return -1; + } + return 0; + } + }; + + Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { + assert(nullptr != ts); + ts->clear(); + PutFixed64(ts, low); + PutFixed64(ts, high); + assert(ts->size() == sizeof(low) + sizeof(high)); + return Slice(*ts); + } +}; + +class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { + public: + DBBasicTestWithTimestamp() + : 
DBBasicTestWithTimestampBase("/db_basic_test_with_timestamp") {} + + protected: + class TestComparator : public TestComparatorBase { + public: + const int kKeyPrefixLength = + 3; // 3: length of "key" in generated keys ("key" + std::to_string(j)) + explicit TestComparator(size_t ts_sz) : TestComparatorBase(ts_sz) {} + + int CompareImpl(const Slice& a, const Slice& b) const override { + int n1 = atoi( + std::string(a.data() + kKeyPrefixLength, a.size() - kKeyPrefixLength) + .c_str()); + int n2 = atoi( + std::string(b.data() + kKeyPrefixLength, b.size() - kKeyPrefixLength) + .c_str()); + return (n1 < n2) ? -1 : (n1 > n2) ? 1 : 0; + } + }; +}; + +#ifndef ROCKSDB_LITE +// A class which remembers the name of each flushed file. +class FlushedFileCollector : public EventListener { + public: + FlushedFileCollector() {} + ~FlushedFileCollector() override {} + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + InstrumentedMutexLock lock(&mutex_); + flushed_files_.push_back(info.file_path); + } + + std::vector GetFlushedFiles() { + std::vector result; + { + InstrumentedMutexLock lock(&mutex_); + result = flushed_files_; + } + return result; + } + + void ClearFlushedFiles() { + InstrumentedMutexLock lock(&mutex_); + flushed_files_.clear(); + } + + private: + std::vector flushed_files_; + InstrumentedMutex mutex_; +}; + +TEST_F(DBBasicTestWithTimestamp, PutAndGetWithCompaction) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 2; + const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps; + const size_t kSplitPosBase = kNumKeysPerTimestamp / 2; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + std::string tmp; + size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_strs(kNumTimestamps); + std::vector read_ts_strs(kNumTimestamps); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); + read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); + const Slice& write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + ASSERT_OK(Put(cf, "key" + std::to_string(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) { + // flush all keys with the same timestamp to two sst files, split at + // incremental positions such that lowerlevel[1].smallest.userkey == + // higherlevel[0].largest.userkey + ASSERT_OK(Flush(cf)); + + // compact files (2 at each level) to a lower level such that all keys + // with the same timestamp is at one level, with newer versions at + // higher levels. 
+ CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + db_->CompactFiles(compact_opt, handles_[cf], + collector->GetFlushedFiles(), + static_cast(kNumTimestamps - i)); + collector->ClearFlushedFiles(); + } + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + ropts.timestamp = &read_ts_list[i]; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); +} +#endif // !ROCKSDB_LITE + +class DBBasicTestWithTimestampWithParam + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + DBBasicTestWithTimestampWithParam() + : DBBasicTestWithTimestampBase( + "/db_basic_test_with_timestamp_with_param") {} + + protected: + class TestComparator : public TestComparatorBase { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : TestComparatorBase(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + int CompareImpl(const Slice& a, const Slice& b) const override { + return cmp_without_ts_->Compare(a, b); + } + }; +}; + +TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 6; + bool memtable_only = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + std::string tmp; + size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + std::vector compression_types; + compression_types.push_back(kNoCompression); + if (Zlib_Supported()) { + compression_types.push_back(kZlibCompression); + } +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + compression_types.push_back(kLZ4Compression); + compression_types.push_back(kLZ4HCCompression); +#endif // LZ4_VERSION_NUMBER >= 10400 + if (ZSTD_Supported()) { + compression_types.push_back(kZSTD); + } + + // Switch compression dictionary on/off to check key extraction + // correctness in kBuffered state + std::vector max_dict_bytes_list = {0, 1 << 14}; // 0 or 16KB + + for (auto compression_type : compression_types) { + for (uint32_t max_dict_bytes : max_dict_bytes_list) { + options.compression = compression_type; + options.compression_opts.max_dict_bytes = max_dict_bytes; + if (compression_type == kZSTD) { + options.compression_opts.zstd_max_train_bytes = max_dict_bytes; + } + options.target_file_size_base = 1 << 26; // 64MB + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_strs(kNumTimestamps); + std::vector read_ts_strs(kNumTimestamps); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.emplace_back( + EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); + read_ts_list.emplace_back( + EncodeTimestamp(1 + i * 
2, 0, &read_ts_strs[i])); + const Slice& write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put( + cf, "key" + std::to_string(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), wopts)); + } + if (!memtable_only) { + ASSERT_OK(Flush(cf)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + ropts.timestamp = &read_ts_list[i]; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; + ++j) { + std::string value; + ASSERT_OK( + db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); + } + } +} + +INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, + ::testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_blob_index_test.cc b/src/rocksdb/db/db_blob_index_test.cc new file mode 100644 index 000000000..24862f771 --- /dev/null +++ b/src/rocksdb/db/db_blob_index_test.cc @@ -0,0 +1,436 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/column_family.h" +#include "db/db_iter.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb +// should accept the value type on write, and report not supported value +// for reads, unless caller request for it explicitly. The base rocksdb +// doesn't understand format of actual blob index (the value). 
+class DBBlobIndexTest : public DBTestBase { + public: + enum Tier { + kMemtable = 0, + kImmutableMemtables = 1, + kL0SstFile = 2, + kLnSstFile = 3, + }; + const std::vector kAllTiers = {Tier::kMemtable, + Tier::kImmutableMemtables, + Tier::kL0SstFile, Tier::kLnSstFile}; + + DBBlobIndexTest() : DBTestBase("/db_blob_index_test") {} + + ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } + + ColumnFamilyData* cfd() { + return reinterpret_cast(cfh())->cfd(); + } + + Status PutBlobIndex(WriteBatch* batch, const Slice& key, + const Slice& blob_index) { + return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key, + blob_index); + } + + Status Write(WriteBatch* batch) { + return dbfull()->Write(WriteOptions(), batch); + } + + std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr, + const Snapshot* snapshot = nullptr) { + ReadOptions read_options; + read_options.snapshot = snapshot; + PinnableSlice value; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); + if (s.IsNotFound()) { + return "NOT_FOUND"; + } + if (s.IsNotSupported()) { + return "NOT_SUPPORTED"; + } + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } + + std::string GetBlobIndex(const Slice& key, + const Snapshot* snapshot = nullptr) { + bool is_blob_index = false; + std::string value = GetImpl(key, &is_blob_index, snapshot); + if (!is_blob_index) { + return "NOT_BLOB"; + } + return value; + } + + ArenaWrappedDBIter* GetBlobIterator() { + return dbfull()->NewIteratorImpl( + ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), + nullptr /*read_callback*/, true /*allow_blob*/); + } + + Options GetTestOptions() { + Options options; + options.create_if_missing = true; + options.num_levels = 2; + options.disable_auto_compactions = true; + // Disable auto flushes. + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + return options; + } + + void MoveDataTo(Tier tier) { + switch (tier) { + case Tier::kMemtable: + break; + case Tier::kImmutableMemtables: + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + break; + case Tier::kL0SstFile: + ASSERT_OK(Flush()); + break; + case Tier::kLnSstFile: + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "dummy")); + ASSERT_OK(Put("z", "dummy")); + ASSERT_OK(Flush()); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + break; + } + } +}; + +// Should be able to write kTypeBlobIndex to memtables and SST files. +TEST_F(DBBlobIndexTest, Write) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + WriteBatch batch; + ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index)); + ASSERT_OK(Write(&batch)); + } + MoveDataTo(tier); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + ASSERT_EQ("blob" + index, GetBlobIndex("key" + index)); + } + } +} + +// Get should be able to return blob index if is_blob_index is provided, +// otherwise return Status::NotSupported status. 
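+// Roughly, the opt-in read path exercised via GetImpl() below looks like:
+//   bool is_blob_index = false;
+//   std::string v = GetImpl("blob_key", &is_blob_index);
+//   // expected: v == "blob_index" and is_blob_index == true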
+TEST_F(DBBlobIndexTest, Get) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + ASSERT_OK(batch.Put("key", "value")); + ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index")); + ASSERT_OK(Write(&batch)); + MoveDataTo(tier); + // Verify normal value + bool is_blob_index = false; + PinnableSlice value; + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("value", GetImpl("key")); + ASSERT_EQ("value", GetImpl("key", &is_blob_index)); + ASSERT_FALSE(is_blob_index); + // Verify blob index + ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); + ASSERT_TRUE(is_blob_index); + } +} + +// Get should NOT return Status::NotSupported if blob index is updated with +// a normal value. +TEST_F(DBBlobIndexTest, Updated) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + for (int i = 0; i < 10; i++) { + ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index")); + } + ASSERT_OK(Write(&batch)); + // Avoid blob values from being purged. + const Snapshot* snapshot = dbfull()->GetSnapshot(); + ASSERT_OK(Put("key1", "new_value")); + ASSERT_OK(Merge("key2", "a")); + ASSERT_OK(Merge("key2", "b")); + ASSERT_OK(Merge("key2", "c")); + ASSERT_OK(Delete("key3")); + ASSERT_OK(SingleDelete("key4")); + ASSERT_OK(Delete("key5")); + ASSERT_OK(Merge("key5", "a")); + ASSERT_OK(Merge("key5", "b")); + ASSERT_OK(Merge("key5", "c")); + ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9")); + MoveDataTo(tier); + for (int i = 0; i < 10; i++) { + ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); + } + ASSERT_EQ("new_value", Get("key1")); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + ASSERT_EQ("NOT_FOUND", Get("key3")); + ASSERT_EQ("NOT_FOUND", Get("key4")); + ASSERT_EQ("a,b,c", GetImpl("key5")); + for (int i = 6; i < 9; i++) { + ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i))); + } + ASSERT_EQ("blob_index", GetBlobIndex("key9")); + dbfull()->ReleaseSnapshot(snapshot); + } +} + +// Iterator should get blob value if allow_blob flag is set, +// otherwise return Status::NotSupported status. 
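+// Concretely: a plain NewIterator(ReadOptions()) positioned on a
+// kTypeBlobIndex entry is expected to become !Valid() with a NotSupported
+// status, while the NewIteratorImpl(..., allow_blob=true) path returns the
+// raw blob index as the value and reports IsBlob() == true.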
+TEST_F(DBBlobIndexTest, Iterate) { + const std::vector> data = { + /*00*/ {kTypeValue}, + /*01*/ {kTypeBlobIndex}, + /*02*/ {kTypeValue}, + /*03*/ {kTypeBlobIndex, kTypeValue}, + /*04*/ {kTypeValue}, + /*05*/ {kTypeValue, kTypeBlobIndex}, + /*06*/ {kTypeValue}, + /*07*/ {kTypeDeletion, kTypeBlobIndex}, + /*08*/ {kTypeValue}, + /*09*/ {kTypeSingleDeletion, kTypeBlobIndex}, + /*10*/ {kTypeValue}, + /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex}, + /*12*/ {kTypeValue}, + /*13*/ + {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex}, + /*14*/ {kTypeValue}, + /*15*/ {kTypeBlobIndex}, + /*16*/ {kTypeValue}, + }; + + auto get_key = [](int index) { + char buf[20]; + snprintf(buf, sizeof(buf), "%02d", index); + return "key" + std::string(buf); + }; + + auto get_value = [&](int index, int version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status::Code expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status().code()); + if (expected_status == Status::kOk) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto create_normal_iterator = [&]() -> Iterator* { + return dbfull()->NewIterator(ReadOptions()); + }; + + auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); }; + + auto check_is_blob = [&](bool is_blob) { + return [is_blob](Iterator* iterator) { + ASSERT_EQ(is_blob, + reinterpret_cast(iterator)->IsBlob()); + }; + }; + + auto verify = [&](int index, Status::Code expected_status, + const Slice& forward_value, const Slice& backward_value, + std::function create_iterator, + std::function extra_check = nullptr) { + // Seek + auto* iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // Next + iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + iterator->Next(); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // SeekForPrev + iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // Prev + iterator = create_iterator(); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + iterator->Prev(); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + }; + + for (auto tier : {Tier::kMemtable} /*kAllTiers*/) { + // Avoid values from being purged. 
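+    // One snapshot is taken per key below, so every older version (e.g. a
+    // blob index that is later overwritten or deleted) stays pinned and
+    // cannot be dropped by the flush/compaction done in MoveDataTo().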
+ std::vector snapshots; + DestroyAndReopen(GetTestOptions()); + + // fill data + for (int i = 0; i < static_cast(data.size()); i++) { + for (int j = static_cast(data[i].size()) - 1; j >= 0; j--) { + std::string key = get_key(i); + std::string value = get_value(i, j); + WriteBatch batch; + switch (data[i][j]) { + case kTypeValue: + ASSERT_OK(Put(key, value)); + break; + case kTypeDeletion: + ASSERT_OK(Delete(key)); + break; + case kTypeSingleDeletion: + ASSERT_OK(SingleDelete(key)); + break; + case kTypeMerge: + ASSERT_OK(Merge(key, value)); + break; + case kTypeBlobIndex: + ASSERT_OK(PutBlobIndex(&batch, key, value)); + ASSERT_OK(Write(&batch)); + break; + default: + assert(false); + }; + } + snapshots.push_back(dbfull()->GetSnapshot()); + } + ASSERT_OK( + dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16))); + snapshots.push_back(dbfull()->GetSnapshot()); + MoveDataTo(tier); + + // Normal iterator + verify(1, Status::kNotSupported, "", "", create_normal_iterator); + verify(3, Status::kNotSupported, "", "", create_normal_iterator); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_normal_iterator); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_normal_iterator); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_normal_iterator); + verify(11, Status::kNotSupported, "", "", create_normal_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_normal_iterator); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_normal_iterator); + + // Iterator with blob support + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); + +#ifndef ROCKSDB_LITE + // Iterator with blob support and using seek. 
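+    // Setting max_sequential_skip_in_iterations to 0 forces DBIter to fall
+    // back to an internal Seek() instead of linearly skipping over hidden
+    // entries, so the same verification also covers the seek-based path.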
+ ASSERT_OK(dbfull()->SetOptions( + cfh(), {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); +#endif // !ROCKSDB_LITE + + for (auto* snapshot : snapshots) { + dbfull()->ReleaseSnapshot(snapshot); + } + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_block_cache_test.cc b/src/rocksdb/db/db_block_cache_test.cc new file mode 100644 index 000000000..3031e56bb --- /dev/null +++ b/src/rocksdb/db/db_block_cache_test.cc @@ -0,0 +1,761 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include "cache/lru_cache.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlockCacheTest : public DBTestBase { + private: + size_t miss_count_ = 0; + size_t hit_count_ = 0; + size_t insert_count_ = 0; + size_t failure_count_ = 0; + size_t compression_dict_miss_count_ = 0; + size_t compression_dict_hit_count_ = 0; + size_t compression_dict_insert_count_ = 0; + size_t compressed_miss_count_ = 0; + size_t compressed_hit_count_ = 0; + size_t compressed_insert_count_ = 0; + size_t compressed_failure_count_ = 0; + + public: + const size_t kNumBlocks = 10; + const size_t kValueSize = 100; + + DBBlockCacheTest() : DBTestBase("/db_block_cache_test") {} + + BlockBasedTableOptions GetTableOptions() { + BlockBasedTableOptions table_options; + // Set a small enough block size so that each key-value get its own block. 
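+    // (With kValueSize == 100, any block_size below the value size means a
+    // block is considered full after a single entry, so each of the
+    // kNumBlocks Put()s produces its own data block.)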
+ table_options.block_size = 1; + return table_options; + } + + Options GetOptions(const BlockBasedTableOptions& table_options) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.avoid_flush_during_recovery = false; + // options.compression = kNoCompression; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + return options; + } + + void InitTable(const Options& /*options*/) { + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value.c_str())); + } + } + + void RecordCacheCounters(const Options& options) { + miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS); + hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT); + insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD); + failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + compressed_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); + compressed_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); + compressed_insert_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD); + compressed_failure_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } + + void RecordCacheCountersForCompressionDict(const Options& options) { + compression_dict_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + compression_dict_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + compression_dict_insert_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + } + + void CheckCacheCounters(const Options& options, size_t expected_misses, + size_t expected_hits, size_t expected_inserts, + size_t expected_failures) { + size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS); + size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT); + size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD); + size_t new_failure_count = + TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + ASSERT_EQ(miss_count_ + expected_misses, new_miss_count); + ASSERT_EQ(hit_count_ + expected_hits, new_hit_count); + ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count); + ASSERT_EQ(failure_count_ + expected_failures, new_failure_count); + miss_count_ = new_miss_count; + hit_count_ = new_hit_count; + insert_count_ = new_insert_count; + failure_count_ = new_failure_count; + } + + void CheckCacheCountersForCompressionDict( + const Options& options, size_t expected_compression_dict_misses, + size_t expected_compression_dict_hits, + size_t expected_compression_dict_inserts) { + size_t new_compression_dict_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + size_t new_compression_dict_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + size_t new_compression_dict_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses, + new_compression_dict_miss_count); + ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits, + new_compression_dict_hit_count); + ASSERT_EQ( + compression_dict_insert_count_ + expected_compression_dict_inserts, + new_compression_dict_insert_count); + compression_dict_miss_count_ = new_compression_dict_miss_count; + compression_dict_hit_count_ = new_compression_dict_hit_count; + 
compression_dict_insert_count_ = new_compression_dict_insert_count; + } + + void CheckCompressedCacheCounters(const Options& options, + size_t expected_misses, + size_t expected_hits, + size_t expected_inserts, + size_t expected_failures) { + size_t new_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); + size_t new_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); + size_t new_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD); + size_t new_failure_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count); + ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count); + ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count); + ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count); + compressed_miss_count_ = new_miss_count; + compressed_hit_count_ = new_hit_count; + compressed_insert_count_ = new_insert_count; + compressed_failure_count_ = new_failure_count; + } +}; + +TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) { + ReadOptions read_options; + read_options.fill_cache = false; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + table_options.block_cache = cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + ASSERT_EQ(0, cache->GetUsage()); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(0)); + ASSERT_LT(0, cache->GetUsage()); + delete iter; + iter = nullptr; + ASSERT_EQ(0, cache->GetUsage()); +} + +TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { + ReadOptions read_options; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + table_options.block_cache = cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + // Load blocks into cache. + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + iterators[i].reset(iter); + } + size_t usage = cache->GetUsage(); + ASSERT_LT(0, usage); + cache->SetCapacity(usage); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + + // Test with strict capacity limit. + cache->SetStrictCapacityLimit(true); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_TRUE(iter->status().IsIncomplete()); + CheckCacheCounters(options, 1, 0, 0, 1); + delete iter; + iter = nullptr; + + // Release iterators and access cache again. 
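+  // Each reset unpins one block but leaves it in the LRU, so re-seeking the
+  // same keys below should be pure cache hits: deltas of (0 misses, 1 hit,
+  // 0 inserts, 0 failures) per iterator.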
+ for (size_t i = 0; i < kNumBlocks - 1; i++) { + iterators[i].reset(); + CheckCacheCounters(options, 0, 0, 0, 0); + } + ASSERT_EQ(0, cache->GetPinnedUsage()); + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 0, 1, 0, 0); + iterators[i].reset(iter); + } +} + +#ifdef SNAPPY +TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { + ReadOptions read_options; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + options.compression = CompressionType::kSnappyCompression; + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + std::shared_ptr compressed_cache = NewLRUCache(1 << 25, 0, false); + table_options.block_cache = cache; + table_options.block_cache_compressed = compressed_cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + // Load blocks into cache. + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + CheckCompressedCacheCounters(options, 1, 0, 1, 0); + iterators[i].reset(iter); + } + size_t usage = cache->GetUsage(); + ASSERT_LT(0, usage); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + size_t compressed_usage = compressed_cache->GetUsage(); + ASSERT_LT(0, compressed_usage); + // Compressed block cache cannot be pinned. + ASSERT_EQ(0, compressed_cache->GetPinnedUsage()); + + // Set strict capacity limit flag. Now block will only load into compressed + // block cache. + cache->SetCapacity(usage); + cache->SetStrictCapacityLimit(true); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_TRUE(iter->status().IsIncomplete()); + CheckCacheCounters(options, 1, 0, 0, 1); + CheckCompressedCacheCounters(options, 1, 0, 1, 0); + delete iter; + iter = nullptr; + + // Clear strict capacity limit flag. This time we shall hit compressed block + // cache. + cache->SetStrictCapacityLimit(false); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + CheckCompressedCacheCounters(options, 0, 1, 0, 0); + delete iter; + iter = nullptr; +} +#endif // SNAPPY + +#ifndef ROCKSDB_LITE + +// Make sure that when options.block_cache is set, after a new table is +// created its index/filter blocks are added to block cache. +TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); + + // index/filter blocks added to block cache right after table creation. 
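+  // Expected deltas for the single flushed table: one index miss plus one
+  // filter miss on first access, two cache adds in total, and no data-block
+  // miss since no data block has been read yet.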
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, /* only index/filter were added */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+  uint64_t int_num;
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  // Make sure filter block is in cache.
+  std::string value;
+  ReadOptions ropt;
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+  // Miss count should remain the same.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Make sure index block is in cache.
+  auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(index_block_hit + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(index_block_hit + 2,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+// With fill_cache = false, fill up the cache, then iterate over the entire
+// DB; verify that the dummy entries inserted in
+// `BlockBasedTable::NewDataBlockIterator` do not cause heap-use-after-free
+// errors in COMPILE_WITH_ASAN=1 runs.
+TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) {
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  std::shared_ptr<Cache> cache = NewLRUCache(10, 0, true);
+  table_options.block_cache = cache;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put("key1", "val1"));
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key3", "val3"));
+  ASSERT_OK(Put("key4", "val4"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key5", "val5"));
+  ASSERT_OK(Put("key6", "val6"));
+  ASSERT_OK(Flush());
+
+  Iterator* iter = nullptr;
+
+  iter = db_->NewIterator(read_options);
+  iter->Seek(ToString(0));
+  while (iter->Valid()) {
+    iter->Next();
+  }
+  delete iter;
+  iter = nullptr;
+}
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  LRUCacheOptions co;
+  // 500 bytes are enough to hold the first two blocks.
+  co.capacity = 500;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "longer_key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+  size_t index_bytes_insert =
+      TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT);
+  size_t filter_bytes_insert =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT);
+  ASSERT_GT(index_bytes_insert, 0);
+  ASSERT_GT(filter_bytes_insert, 0);
+  ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
+  // Set the cache capacity to the current usage.
+  cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
+  // The index and filter eviction statistics were broken by the refactoring
+  // that moved the readers out of the block cache. Disabling these until we
+  // can bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
+  // Note that the second key needs to be no longer than the first one.
+  // Otherwise the second index block may not fit in cache.
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+  // The cache evicted the old index and filter entries.
+  ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT),
+            index_bytes_insert);
+  ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
+            filter_bytes_insert);
+  // The index and filter eviction statistics were broken by the refactoring
+  // that moved the readers out of the block cache. Disabling these until we
+  // can bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+  //           index_bytes_insert);
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+  //           filter_bytes_insert);
+}
+
+namespace {
+
+// A mock cache that wraps LRUCache and records how many entries have been
+// inserted for each priority.
+class MockCache : public LRUCache {
+ public:
+  static uint32_t high_pri_insert_count;
+  static uint32_t low_pri_insert_count;
+
+  MockCache()
+      : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
+                 false /*strict_capacity_limit*/,
+                 0.0 /*high_pri_pool_ratio*/) {}
+
+  Status Insert(const Slice& key, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value),
+                Handle** handle, Priority priority) override {
+    if (priority == Priority::LOW) {
+      low_pri_insert_count++;
+    } else {
+      high_pri_insert_count++;
+    }
+    return LRUCache::Insert(key, value, charge, deleter, handle, priority);
+  }
+};
+
+uint32_t MockCache::high_pri_insert_count = 0;
+uint32_t MockCache::low_pri_insert_count = 0;
+
+}  // anonymous namespace
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) {
+  for (auto priority : {Cache::Priority::LOW, Cache::Priority::HIGH}) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache.reset(new MockCache());
+    table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+    table_options.cache_index_and_filter_blocks_with_high_priority =
+        priority == Cache::Priority::HIGH ? true : false;
+    options.table_factory.reset(new BlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    MockCache::high_pri_insert_count = 0;
+    MockCache::low_pri_insert_count = 0;
+
+    // Create a new table.
+    ASSERT_OK(Put("foo", "value"));
+    ASSERT_OK(Put("bar", "value"));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+    // Index/filter blocks are added to block cache right after table creation.
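+    // (cache_index_and_filter_blocks_with_high_priority decides the Priority
+    // passed to Cache::Insert() for these two metadata blocks, which is what
+    // the MockCache counters below observe.)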
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(2, /* only index/filter were added */
+              TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+    if (priority == Cache::Priority::LOW) {
+      ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(2u, MockCache::low_pri_insert_count);
+    } else {
+      ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(0u, MockCache::low_pri_insert_count);
+    }
+
+    // Access data block.
+    ASSERT_EQ("value", Get("foo"));
+
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(3, /*adding data block*/
+              TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+    // Data block should be inserted with low priority.
+    if (priority == Cache::Priority::LOW) {
+      ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(3u, MockCache::low_pri_insert_count);
+    } else {
+      ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(1u, MockCache::low_pri_insert_count);
+    }
+  }
+}
+
+TEST_F(DBBlockCacheTest, ParanoidFileChecks) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.level0_file_num_compaction_trigger = 2;
+  options.paranoid_file_checks = true;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "1_key", "val"));
+  ASSERT_OK(Put(1, "9_key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(1, /* read and cache data block */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  ASSERT_OK(Put(1, "1_key2", "val2"));
+  ASSERT_OK(Put(1, "9_key2", "val2"));
+  // Create a new SST file. This will further trigger a compaction
+  // and generate another file.
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(3, /* Totally 3 files created up to now */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // After disabling options.paranoid_file_checks, no further block
+  // is added after generating a new file.
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+  ASSERT_OK(Put(1, "1_key3", "val3"));
+  ASSERT_OK(Put(1, "9_key3", "val3"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "1_key4", "val4"));
+  ASSERT_OK(Put(1, "9_key4", "val4"));
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(3, /* Still 3: paranoid checks are now off, so no new adds */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBBlockCacheTest, CompressedCache) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  int num_iter = 80;
+
+  // Run this test in four iterations:
+  // Iteration 1: only an uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  // Iteration 4: both block cache and compressed cache, but DB is not
+  // compressed
+  for (int iter = 0; iter < 4; iter++) {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 64 * 1024;  // small write buffer
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    BlockBasedTableOptions table_options;
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        table_options.block_cache = NewLRUCache(8 * 1024);
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        table_options.no_block_cache = true;
+        table_options.block_cache = nullptr;
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        table_options.block_cache = NewLRUCache(1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 3:
+        // both block cache and compressed cache, but DB is not compressed;
+        // also, make block cache sizes bigger, to trigger block cache hits
+        table_options.block_cache = NewLRUCache(1024 * 1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.compression = kNoCompression;
+        break;
+      default:
+        FAIL();
+    }
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // default column family doesn't have block cache
+    Options no_block_cache_opts;
+    no_block_cache_opts.statistics = options.statistics;
+    no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+    BlockBasedTableOptions table_options_no_bc;
+    table_options_no_bc.no_block_cache = true;
+    no_block_cache_opts.table_factory.reset(
+        NewBlockBasedTableFactory(table_options_no_bc));
+    ReopenWithColumnFamilies(
+        {"default", "pikachu"},
+        std::vector<Options>({no_block_cache_opts, options}));
+
+    Random rnd(301);
+
+    // Write 80 values of ~1KB each; each distinct value is repeated four
+    // times, so the data compresses well.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    std::string str;
+    for (int i = 0; i < num_iter; i++) {
+      if (i % 4 == 0) {  // high compression ratio
+        str = RandomString(&rnd, 1000);
+      }
+      values.push_back(str);
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+
+    // Flush all data from memtable so that reads are from block cache.
+    ASSERT_OK(Flush(1));
+
+    for (int i = 0; i < num_iter; i++) {
+      ASSERT_EQ(Get(1, Key(i)), values[i]);
+    }
+
+    // Check that we triggered the appropriate code paths in the cache.
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 3:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        // The compressed cache doesn't have any hits, since blocks are not
+        // compressed on storage.
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+        break;
+      default:
+        FAIL();
+    }
+
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+  }
+}
+
+TEST_F(DBBlockCacheTest, CacheCompressionDict) {
+  const int kNumFiles = 4;
+  const int kNumEntriesPerFile = 128;
+  const int kNumBytesPerEntry = 1024;
+
+  // Try all the available libraries that support dictionary compression.
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+  if (LZ4_Supported()) {
+    compression_types.push_back(kLZ4Compression);
+    compression_types.push_back(kLZ4HCCompression);
+  }
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  } else if (ZSTDNotFinal_Supported()) {
+    compression_types.push_back(kZSTDNotFinalCompression);
+  }
+  Random rnd(301);
+  for (auto compression_type : compression_types) {
+    Options options = CurrentOptions();
+    options.compression = compression_type;
+    options.compression_opts.max_dict_bytes = 4096;
+    options.create_if_missing = true;
+    options.num_levels = 2;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache.reset(new MockCache());
+    options.table_factory.reset(new BlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    RecordCacheCountersForCompressionDict(options);
+
+    for (int i = 0; i < kNumFiles; ++i) {
+      ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+      for (int j = 0; j < kNumEntriesPerFile; ++j) {
+        std::string value = RandomString(&rnd, kNumBytesPerEntry);
+        ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+      }
+      ASSERT_OK(Flush());
+    }
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+    // Compression dictionary blocks are preloaded.
+    CheckCacheCountersForCompressionDict(
+        options, kNumFiles /* expected_compression_dict_misses */,
+        0 /* expected_compression_dict_hits */,
+        kNumFiles /* expected_compression_dict_inserts */);
+
+    // Seek to a key in a file. It should cause the SST's dictionary meta-block
+    // to be read.
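+    // (The dictionary was already inserted into the cache at table-open time,
+    // so this read is expected to be a cache hit rather than a miss; the
+    // counters are verified right below.)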
+    RecordCacheCounters(options);
+    RecordCacheCountersForCompressionDict(options);
+    ReadOptions read_options;
+    ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+    // Two block hits: index and dictionary since they are prefetched
+    // One block missed/added: data block
+    CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
+                       1 /* expected_inserts */, 0 /* expected_failures */);
+    CheckCacheCountersForCompressionDict(
+        options, 0 /* expected_compression_dict_misses */,
+        1 /* expected_compression_dict_hits */,
+        0 /* expected_compression_dict_inserts */);
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_bloom_filter_test.cc b/src/rocksdb/db/db_bloom_filter_test.cc
new file mode 100644
index 000000000..dcad00327
--- /dev/null
+++ b/src/rocksdb/db/db_bloom_filter_test.cc
@@ -0,0 +1,1910 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/filter_policy_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+using BFP = BloomFilterPolicy;
+}  // namespace
+
+// DB tests related to bloom filter.
+
+class DBBloomFilterTest : public DBTestBase {
+ public:
+  DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {}
+};
+
+class DBBloomFilterTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<BFP::Mode, bool, uint32_t>> {
+  // public testing::WithParamInterface<bool> {
+ protected:
+  BFP::Mode bfp_impl_;
+  bool partition_filters_;
+  uint32_t format_version_;
+
+ public:
+  DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {}
+
+  ~DBBloomFilterTestWithParam() override {}
+
+  void SetUp() override {
+    bfp_impl_ = std::get<0>(GetParam());
+    partition_filters_ = std::get<1>(GetParam());
+    format_version_ = std::get<2>(GetParam());
+  }
+};
+
+class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+  const char* Name() const override {
+    return "SliceTransformLimitedDomainGeneric";
+  }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 5);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 5;
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 5;
+  }
+};
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
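+// (A false positive only costs an extra lookup; a false negative would mean
+// failing to find an existing key, which a bloom filter must never cause.)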
+// To make the test deterministic, use a much larger number of bits per key
+// (20) than bits in the key, so that false positives are eliminated.
+TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
+  do {
+    ReadOptions ropts;
+    std::string value;
+    anon::OptionsOverride options_override;
+    options_override.filter_policy.reset(new BFP(20, bfp_impl_));
+    options_override.partition_filters = partition_filters_;
+    options_override.metadata_block_size = 32;
+    Options options = CurrentOptions(options_override);
+    if (partition_filters_ &&
+        static_cast<BlockBasedTableOptions*>(
+            options.table_factory->GetOptions())
+                ->index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) {
+      // In the current implementation partitioned filters depend on
+      // partitioned indexes.
+      continue;
+    }
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+    ASSERT_OK(Put(1, "a", "b"));
+    bool value_found = false;
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(value_found);
+    ASSERT_EQ("b", value);
+
+    ASSERT_OK(Flush(1));
+    value.clear();
+
+    uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(!value_found);
+    // Assert that no new files were opened and no new blocks were
+    // read into block cache.
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "a"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                true /* disallow trivial move */);
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "c"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    // The KeyMayExist function only checks data in block caches, which is
+    // not used by the plain table format.
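+    // That is why the ChangeOptions() call below skips the plain-table
+    // configurations (kSkipPlainTable).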
+  } while (
+      ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor =
+        std::make_shared<SliceTransformLimitedDomainGeneric>();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+    ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+    dbfull()->Flush(fo);
+
+    ASSERT_EQ("foo", Get("barbarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+    ASSERT_EQ("foo2", Get("barbarbar2"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+    ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ(
+        1,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+    ASSERT_EQ(
+        2,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ro.total_order_seek = true;
+    ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound());
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+    ASSERT_EQ(
+        2,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+    ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+    dbfull()->Flush(fo);
+
+    ASSERT_EQ("foo", Get("barbarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("foo2", Get("barbarbar2"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + + ro.total_order_seek = true; + ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound()); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ( + 2, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + get_perf_context()->Reset(); + } +} + +TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { + for (bool partition_filters : {true, false}) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + if (partition_filters) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + dbfull()->Flush(fo); + + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Reopen with whole key filtering enabled and prefix extractor + // NULL. Bloom filter should be off for both of whole key and + // prefix bloom. + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor.reset(); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + // Write DB with only full key filtering. + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Reopen with both of whole key off and prefix extractor enabled. + // Still no bloom filter should be used. 
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    // Try to create a DB with mixed files:
+    ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Need to insert some keys to make sure files are not filtered out by
+    // key ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+    options.prefix_extractor.reset();
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+
+    // Try to create a DB with mixed files.
+    ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+    // In this case we need to insert some keys to make sure files are
+    // not filtered out by key ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    Flush();
+
+    // Now we have two files:
+    // File 1: An older file with prefix bloom.
+    // File 2: A newer file with whole bloom filter.
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+    // Reopen with the same setting: only whole key is used.
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+    // Restart with both filters allowed.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    // File 1 will have it filtered out.
+    // File 2 will not, as the prefix `foo` exists in the file.
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+    // Restart with only prefix bloom allowed.
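+    // (Only File 1's filter was built with the matching fixed-prefix
+    // extractor; File 2's whole-key filter cannot serve prefix checks, so
+    // only File 1 contributes to BLOOM_FILTER_USEFUL below.)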
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    uint64_t bloom_filter_useful_all_levels = 0;
+    for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+      if (kv.second.bloom_filter_useful > 0) {
+        bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+      }
+    }
+    ASSERT_EQ(12, bloom_filter_useful_all_levels);
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
+  do {
+    Options options = CurrentOptions();
+    env_->count_random_reads_ = true;
+    options.env = env_;
+    // ChangeCompactOptions() only changes compaction style, which does not
+    // trigger a reset of table_factory.
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy.reset(new BFP(10, bfp_impl_));
+    table_options.partition_filters = partition_filters_;
+    if (partition_filters_) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    table_options.format_version = format_version_;
+    if (format_version_ >= 4) {
+      // Value delta encoding is challenged more with an index interval > 1.
+      table_options.index_block_restart_interval = 8;
+    }
+    table_options.metadata_block_size = 32;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Populate multiple layers.
+    const int N = 10000;
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Compact(1, "a", "z");
+    for (int i = 0; i < N; i += 100) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Flush(1);
+
+    // Prevent auto compactions triggered by seeks.
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+
+    // Lookup present keys. Should rarely read from the small sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    int reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d present => %d reads\n", N, reads);
+    ASSERT_GE(reads, N);
+    if (partition_filters_) {
+      // Without block cache, we read an extra partition filter per level
+      // for each read, and an extra partition index per read.
+      ASSERT_LE(reads, 4 * N + 2 * N / 100);
+    } else {
+      ASSERT_LE(reads, N + 2 * N / 100);
+    }
+
+    // Lookup missing keys. Should rarely read from either sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+    }
+    reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d missing => %d reads\n", N, reads);
+    if (partition_filters_) {
+      // With partitioned filters we read one extra filter partition per
+      // level for each missed read.
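+      // (Roughly two files are probed per missed key and each probe reads a
+      // filter partition, giving the 2 * N term below; the 3 * N / 100 slack
+      // covers index partitions and occasional false positives.)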
+      ASSERT_LE(reads, 2 * N + 3 * N / 100);
+    } else {
+      ASSERT_LE(reads, 3 * N / 100);
+    }
+
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+    Close();
+  } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+INSTANTIATE_TEST_CASE_P(
+    FormatDef, DBBloomFilterTestDefFormatVersion,
+    ::testing::Values(
+        std::make_tuple(BFP::kDeprecatedBlock, false,
+                        test::kDefaultFormatVersion),
+        std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion),
+        std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+    FormatDef, DBBloomFilterTestWithParam,
+    ::testing::Values(
+        std::make_tuple(BFP::kDeprecatedBlock, false,
+                        test::kDefaultFormatVersion),
+        std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion),
+        std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+    FormatLatest, DBBloomFilterTestWithParam,
+    ::testing::Values(
+        std::make_tuple(BFP::kDeprecatedBlock, false,
+                        test::kLatestFormatVersion),
+        std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion),
+        std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion)));
+#endif  // ROCKSDB_VALGRIND_RUN
+
+TEST_F(DBBloomFilterTest, BloomFilterRate) {
+  while (ChangeFilterOptions()) {
+    Options options = CurrentOptions();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    // Add a large key to make the file contain a wide range.
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
+
+    // Check if they can be found.
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+    // Check if the filter is useful.
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+    }
+    ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+    ASSERT_GE(
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful,
+        maxKey * 0.98);
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  // Create with block-based filter.
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  Flush(1);
+
+  // Check DB with full filter.
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Check if they can be found.
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
+  }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+  // Check DB with partitioned full filter.
+  table_options.partition_filters = true;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Check if they can be found.
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
+  }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+}
+
+TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) {
+  for (bool partition_filters : {true, false}) {
+    Options options = CurrentOptions();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockBasedTableOptions table_options;
+    if (partition_filters) {
+      table_options.partition_filters = true;
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    // Create with full filter.
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
+
+    // Check DB with block-based filter.
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+    // Check if they can be found.
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  }
+}
+
+namespace {
+// A wrapped bloom over a block-based FilterPolicy.
+class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy {
+ public:
+  explicit TestingWrappedBlockBasedFilterPolicy(int bits_per_key)
+      : filter_(NewBloomFilterPolicy(bits_per_key, true)), counter_(0) {}
+
+  ~TestingWrappedBlockBasedFilterPolicy() override { delete filter_; }
+
+  const char* Name() const override {
+    return "TestingWrappedBlockBasedFilterPolicy";
+  }
+
+  void CreateFilter(const ROCKSDB_NAMESPACE::Slice* keys, int n,
+                    std::string* dst) const override {
+    std::unique_ptr<ROCKSDB_NAMESPACE::Slice[]> user_keys(
+        new ROCKSDB_NAMESPACE::Slice[n]);
+    for (int i = 0; i < n; ++i) {
+      user_keys[i] = convertKey(keys[i]);
+    }
+    return filter_->CreateFilter(user_keys.get(), n, dst);
+  }
+
+  bool KeyMayMatch(const ROCKSDB_NAMESPACE::Slice& key,
+                   const ROCKSDB_NAMESPACE::Slice& filter) const override {
+    counter_++;
+    return filter_->KeyMayMatch(convertKey(key), filter);
+  }
+
+  uint32_t GetCounter() { return counter_; }
+
+ private:
+  const FilterPolicy* filter_;
+  mutable uint32_t counter_;
+
+  ROCKSDB_NAMESPACE::Slice convertKey(
+      const ROCKSDB_NAMESPACE::Slice& key) const {
+    return key;
+  }
+};
+}  // namespace
+
+TEST_F(DBBloomFilterTest, WrappedBlockBasedFilterPolicy) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  TestingWrappedBlockBasedFilterPolicy* policy =
+      new TestingWrappedBlockBasedFilterPolicy(10);
+  table_options.filter_policy.reset(policy);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  // Add a large key to make the file contain a wide range.
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  ASSERT_EQ(0U, policy->GetCounter());
+  Flush(1);
+
+  // Check if they can be found.
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
+  }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ(1U * maxKey, policy->GetCounter());
+
+  // Check if the filter is useful.
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+  }
+  ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+  ASSERT_EQ(2U * maxKey, policy->GetCounter());
+}
+
+namespace {
+// NOTE: This class is referenced by HISTORY.md as a model for a wrapper
+// FilterPolicy selecting among configurations based on context.
+class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
+ public:
+  explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+                                           int bpk_otherwise)
+      : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
+        policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
+        policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
+
+  // OK to use built-in policy name because we are deferring to a
+  // built-in builder. We aren't changing the serialized format.
+  const char* Name() const override { return policy_fifo_->Name(); }
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override {
+    if (context.compaction_style == kCompactionStyleFIFO) {
+      return policy_fifo_->GetBuilderWithContext(context);
+    } else if (context.level_at_creation == 0) {
+      return policy_l0_other_->GetBuilderWithContext(context);
+    } else {
+      return policy_otherwise_->GetBuilderWithContext(context);
+    }
+  }
+
+  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+    // OK to defer to any of them; they all can parse built-in filters
+    // from any settings.
+    return policy_fifo_->GetFilterBitsReader(contents);
+  }
+
+  // Defer, just in case the configuration uses block-based filter.
+  void CreateFilter(const Slice* keys, int n,
+                    std::string* dst) const override {
+    policy_otherwise_->CreateFilter(keys, n, dst);
+  }
+  bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+    return policy_otherwise_->KeyMayMatch(key, filter);
+  }
+
+ private:
+  const std::unique_ptr<const FilterPolicy> policy_fifo_;
+  const std::unique_ptr<const FilterPolicy> policy_l0_other_;
+  const std::unique_ptr<const FilterPolicy> policy_otherwise_;
+};
+
+class TestingContextCustomFilterPolicy
+    : public LevelAndStyleCustomFilterPolicy {
+ public:
+  explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+                                            int bpk_otherwise)
+      : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other,
+                                        bpk_otherwise) {}
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override {
+    test_report_ += "cf=";
+    test_report_ += context.column_family_name;
+    test_report_ += ",cs=";
+    test_report_ +=
+        OptionsHelper::compaction_style_to_string[context.compaction_style];
+    test_report_ += ",lv=";
+    test_report_ += std::to_string(context.level_at_creation);
+    test_report_ += "\n";
+
+    return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context);
+  }
+
+  std::string DumpTestReport() {
+    std::string rv;
+    std::swap(rv, test_report_);
+    return rv;
+  }
+
+ private:
+  mutable std::string test_report_;
+};
+}  // namespace
+
+TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
+  for (bool fifo : {true, false}) {
+    Options options = CurrentOptions();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.compaction_style =
+        fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;
+
+    BlockBasedTableOptions table_options;
+    auto policy = std::make_shared<TestingContextCustomFilterPolicy>(15, 8, 5);
+    table_options.filter_policy = policy;
+    table_options.format_version = 5;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey / 2; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    // Add a large key to make the file contain a wide range.
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
+    EXPECT_EQ(policy->DumpTestReport(),
+              fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n"
+                   : "cf=bob,cs=kCompactionStyleLevel,lv=0\n");
+
+    for (int i = maxKey / 2; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Flush(1);
+    EXPECT_EQ(policy->DumpTestReport(),
+              fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n"
+                   : "cf=bob,cs=kCompactionStyleLevel,lv=0\n");
+
+    // Check that they can be found.
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    // Since we have two tables / two filters, we might have Bloom checks on
+    // our queries, but no more than one "useful" per query on a found key.
+    EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey);
+
+    // Check that we have two filters, each about
+    //   fifo: 0.12% FP rate (15 bits per key)
+    //   level: 2.3% FP rate (8 bits per key)
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+    }
+    {
+      auto useful_count =
+          TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+      EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975));
+      EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98));
+    }
+
+    if (!fifo) {  // FIFO only has L0
+      // Full compaction
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                                  nullptr));
+      EXPECT_EQ(policy->DumpTestReport(),
+                "cf=bob,cs=kCompactionStyleLevel,lv=1\n");
+
+      // Check that we now have one filter, about 9.2% FP rate
+      // (5 bits per key).
+      for (int i = 0; i < maxKey; i++) {
+        ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+      }
+      {
+        auto useful_count =
+            TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+        EXPECT_GE(useful_count, maxKey * 0.90);
+        EXPECT_LE(useful_count, maxKey * 0.91);
+      }
+    }
+
+    // Destroy
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    dbfull()->DestroyColumnFamilyHandle(handles_[1]);
+    handles_[1] = nullptr;
+  }
+}
+
+class SliceTransformLimitedDomain : public SliceTransform {
+  const char* Name() const override { return "SliceTransformLimitedDomain"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 5);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 5 && src[0] == 'x';
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 5 && dst[0] == 'x';
+  }
+};
+
+TEST_F(DBBloomFilterTest, PrefixExtractorFullFilter) {
+  BlockBasedTableOptions bbto;
+  // Full Filter Block
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1111_AAAA", "val1"));
+  ASSERT_OK(Put("x1112_AAAA", "val2"));
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to the filter.
+  ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("x1111_AAAA"), "val1");
+  ASSERT_EQ(Get("x1112_AAAA"), "val2");
+  ASSERT_EQ(Get("x1113_AAAA"), "val3");
+  ASSERT_EQ(Get("x1114_AAAA"), "val4");
+  // Was not added to the filter, but RocksDB will try to read it from the
+  // filter.
+  ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) {
+  BlockBasedTableOptions bbto;
+  // Block Filter Block
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true));
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to the filter.
+  ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+  ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+  ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+  ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> iter_res;
+  auto iter = db_->NewIterator(ReadOptions());
+  // Seek to a key that was not in the prefix extractor's domain.
+  for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+    iter_res.emplace_back(iter->value().ToString());
+  }
+
+  std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+  ASSERT_EQ(iter_res, expected_res);
+  delete iter;
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+  // Regression test for #2743: the range delete tombstones in memtable should
the range delete tombstones in memtable should + // be added even when Get() skips searching due to its prefix bloom filter + const int kMemtableSize = 1 << 20; // 1MB + const int kMemtablePrefixFilterSize = 1 << 13; // 8KB + const int kPrefixLen = 4; + Options options = CurrentOptions(); + options.memtable_prefix_bloom_size_ratio = + static_cast(kMemtablePrefixFilterSize) / kMemtableSize; + options.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen)); + options.write_buffer_size = kMemtableSize; + options.memtable_whole_key_filtering = false; + Reopen(options); + std::string key1("AAAABBBB"); + std::string key2("AAAACCCC"); // not in DB + std::string key3("AAAADDDD"); + std::string key4("AAAAEEEE"); + std::string value1("Value1"); + std::string value3("Value3"); + std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + + // check memtable bloom stats + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + // same prefix, bloom filter false positive + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + + // enable whole key bloom filter + options.memtable_whole_key_filtering = true; + Reopen(options); + // check memtable bloom stats + ASSERT_OK(Put(key3, value3, WriteOptions())); + ASSERT_EQ("NOT_FOUND", Get(key2)); + // whole key bloom filter kicks in and determines it's a miss + ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + + // verify whole key filtering does not depend on prefix_extractor + options.prefix_extractor.reset(); + Reopen(options); + // check memtable bloom stats + ASSERT_OK(Put(key4, value4, WriteOptions())); + ASSERT_EQ("NOT_FOUND", Get(key2)); + // whole key bloom filter kicks in and determines it's a miss + ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); +} + +TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { + constexpr size_t kPrefixSize = 8; + const std::string kKey = "key"; + assert(kKey.size() < kPrefixSize); + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize)); + options.memtable_prefix_bloom_size_ratio = 0.25; + Reopen(options); + ASSERT_OK(Put(kKey, "v")); + ASSERT_EQ("v", Get(kKey)); + std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + iter->Seek(kKey); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(kKey, iter->key()); + iter->SeekForPrev(kKey); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(kKey, iter->key()); +} + +#ifndef ROCKSDB_LITE +namespace { +namespace BFP2 { +// Extends BFP::Mode with option to use Plain table +using PseudoMode = int; +static constexpr PseudoMode kPlainTable = -1; +} // namespace BFP2 +} // namespace + +class BloomStatsTestWithParam + : public DBBloomFilterTest, + public testing::WithParamInterface> { + public: + BloomStatsTestWithParam() { + bfp_impl_ = std::get<0>(GetParam()); + partition_filters_ = std::get<1>(GetParam()); + + options_.create_if_missing = true; + options_.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4)); + options_.memtable_prefix_bloom_size_ratio = + 8.0 * 1024.0 / static_cast(options_.write_buffer_size); + if (bfp_impl_ == BFP2::kPlainTable) { + assert(!partition_filters_); // not supported in plain table + PlainTableOptions table_options; + options_.table_factory.reset(NewPlainTableFactory(table_options)); + } else { + BlockBasedTableOptions table_options; + 
+      table_options.hash_index_allow_collision = false;
+      if (partition_filters_) {
+        assert(bfp_impl_ != BFP::kDeprecatedBlock);
+        table_options.partition_filters = partition_filters_;
+        table_options.index_type =
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      }
+      table_options.filter_policy.reset(
+          new BFP(10, static_cast<BFP::Mode>(bfp_impl_)));
+      options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    }
+    options_.env = env_;
+
+    get_perf_context()->Reset();
+    DestroyAndReopen(options_);
+  }
+
+  ~BloomStatsTestWithParam() override {
+    get_perf_context()->Reset();
+    Destroy(options_);
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  BFP2::PseudoMode bfp_impl_;
+  bool partition_filters_;
+  Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2
+// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expect SST bloom hit stat to be 2
+// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+  // sanity checks
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+  Flush();
+
+  // sanity checks
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+  // check SST bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+}
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+  // check memtable bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
get_perf_context()->bloom_memtable_miss_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + + Flush(); + + iter.reset(dbfull()->NewIterator(ReadOptions())); + + // Check SST bloom stats + iter->Seek(key1); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value1, iter->value().ToString()); + ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count); + + iter->Seek(key3); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value3, iter->value().ToString()); + // The seek doesn't check block-based bloom filter because last index key + // starts with the same prefix we're seeking to. + uint64_t expected_hits = bfp_impl_ == BFP::kDeprecatedBlock ? 1 : 2; + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); +} + +INSTANTIATE_TEST_CASE_P( + BloomStatsTestWithParam, BloomStatsTestWithParam, + ::testing::Values(std::make_tuple(BFP::kDeprecatedBlock, false), + std::make_tuple(BFP::kLegacyBloom, false), + std::make_tuple(BFP::kLegacyBloom, true), + std::make_tuple(BFP::kFastLocalBloom, false), + std::make_tuple(BFP::kFastLocalBloom, true), + std::make_tuple(BFP2::kPlainTable, false))); + +namespace { +void PrefixScanInit(DBBloomFilterTest* dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
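+ // Why exactly 2 (explanatory note derived from the key layout generated
+ // below): the seek prefix "03______" occurs only in the two GROUP 1 files
+ // that hold "03______:end" and "03______:start". Every other file's prefix
+ // bloom lacks "03______" (GROUP 0 stores prefixes 00 and 10, GROUP 2 stores
+ // 00 and 07-11), so those files are skipped without a disk read.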
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } +} +} // namespace + +TEST_F(DBBloomFilterTest, PrefixScan) { + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + ASSERT_EQ(key.difference_offset(prefix), 8); + ASSERT_EQ(prefix.difference_offset(key), 8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false + options.allow_concurrent_memtable_write = false; + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // 11 RAND I/Os + DestroyAndReopen(options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (!iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while +} + +TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; + options.arena_block_size = 4 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 256 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.compression = kNoCompression; + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, true)); + bbto.whole_key_filtering = true; + 
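+ // Note: optimize_filters_for_hits (set just below) makes compaction skip
+ // building bloom filters for files in the bottommost level, on the
+ // assumption that most reads reaching that level find their key. The
+ // BLOOM_FILTER_USEFUL ticker bounds later in this test depend on this.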
options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.optimize_filters_for_hits = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + CreateAndReopenWithCF({"mypikachu"}, options); + + int numkeys = 200000; + + // Generate randomly shuffled keys, so the updates are almost + // random. + std::vector<int> keys; + keys.reserve(numkeys); + for (int i = 0; i < numkeys; i += 2) { + keys.push_back(i); + } + std::random_shuffle(std::begin(keys), std::end(keys)); + + int num_inserted = 0; + for (int key : keys) { + ASSERT_OK(Put(1, Key(key), "val")); + if (++num_inserted % 1000 == 0) { + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + } + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + if (NumTableFilesAtLevel(0, 1) == 0) { + // No Level 0 file. Create one. + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + } + + for (int i = 1; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND"); + } + + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + // Now we have three sorted runs, L0, L5 and L6, and most files in L6 have + // no bloom filter. Most keys are checked against bloom filters twice. + ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); + ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; + } + } + ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2); + ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2); + + for (int i = 0; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + // Part 2 (read path): rewrite last level with blooms, then verify they get + // cached only if !optimize_filters_for_hits + options.disable_auto_compactions = true; + options.num_levels = 9; + options.optimize_filters_for_hits = false; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + MoveFilesToLevel(7 /* level */, 1 /* column family index */); + + std::string value = Get(1, Key(0)); + uint64_t prev_cache_filter_hits = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + value = Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_hits + 1, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Now that we know the filter blocks exist in the last level files, see if + // filter caching is skipped for this optimization + options.optimize_filters_for_hits = true; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + value = Get(1, Key(0)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // Check filter block
ignored for files preloaded during DB::Open() + options.max_open_files = -1; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + uint64_t prev_cache_filter_misses = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Check filter block ignored for file trivially-moved to bottom level + bbto.block_cache.reset(); + options.max_open_files = 100; // setting > -1 makes it not preload all files + options.statistics = CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + ASSERT_OK(Put(1, Key(numkeys + 1), "val")); + ASSERT_OK(Flush(1)); + + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kSkip; + compact_options.change_level = true; + compact_options.target_level = 7; + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + prev_cache_filter_misses = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + value = Get(1, Key(numkeys + 1)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + + // Check filter block not cached for iterator + bbto.block_cache.reset(); + options.statistics = CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1])); + iter->SeekToFirst(); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + get_perf_context()->Reset(); +} + +int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) { + int count = 0; + for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); + iter->Next()) { + count++; + } + return count; +} + +// Use iterate_upper_bound to hint at compatibility with existing bloom filters. +// The BF is considered compatible if 1) the upper bound and the seek key +// transform into the same string, or 2) the transformed seek key is of the same +// length as the upper bound and the two keys are adjacent according to the +// comparator.
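+// Illustrative aside (not part of the test suite): a minimal sketch of a scan
+// that satisfies rule 2) above, assuming the default bytewise comparator. The
+// upper bound "abce" has the same length as the transformed seek key "abcd"
+// and is its immediate successor, so the SST prefix bloom may be consulted.
+// The function name is hypothetical.
+#if 0  // sketch only, kept out of the build
+void PrefixBoundedScanSketch(DB* db) {
+  Slice upper_bound("abce");
+  ReadOptions read_options;
+  read_options.prefix_same_as_start = true;
+  read_options.iterate_upper_bound = &upper_bound;
+  std::unique_ptr<Iterator> it(db->NewIterator(read_options));
+  for (it->Seek("abcd0000"); it->Valid(); it->Next()) {
+    // All keys seen here share the "abcd" prefix.
+  }
+}
+#endif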
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { + for (auto bfp_impl : BFP::kAllFixedImpls) { + int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(4)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(new BFP(10, bfp_impl)); + table_options.index_shortening = BlockBasedTableOptions:: + IndexShorteningMode::kShortenSeparatorsAndSuccessor; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("abcdxxx0", "val1")); + ASSERT_OK(Put("abcdxxx1", "val2")); + ASSERT_OK(Put("abcdxxx2", "val3")); + ASSERT_OK(Put("abcdxxx3", "val4")); + dbfull()->Flush(FlushOptions()); + { + // prefix_extractor has not changed, BF will always be read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + } + { + Slice upper_bound("abcdzzzz"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.5")); + { + // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); + // should check bloom filter since upper bound meets requirement + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx01, abcey) is not valid bound since upper bound is too long for + // the BF in SST (capped:4) + Slice upper_bound("abcey"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); + // should skip bloom filter since upper bound is too long + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx02, abcdy) is a valid bound since the prefix is the same + Slice upper_bound("abcdy"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx02"), 4); + // should check bloom filter since upper bound matches transformed seek + // key + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 
2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the + // same prefix, 2) the prefixes are not consecutive + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); + // should skip bloom filter since mismatch is found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); + { + // [abc, abd) is not a valid bound since the upper bound is too short + // for BF (capped:4) + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); + { + // set back to capped:4 and verify BF is always read + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + } +} + +// Create multiple SST files each with a different prefix_extractor config, +// verify iterators can read all SST files using the latest config. 
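+// Illustrative aside: the dynamic switch exercised below goes through
+// DB::SetOptions(), since prefix_extractor is a mutable column-family option.
+// Each SST keeps the bloom built with the extractor that was active when it
+// was flushed, while readers apply the latest setting. A minimal sketch
+// (hypothetical function name):
+#if 0  // sketch only, kept out of the build
+void SwitchPrefixExtractorSketch(DB* db) {
+  // "fixed:N" maps to NewFixedPrefixTransform(N),
+  // "capped:N" to NewCappedPrefixTransform(N).
+  Status s = db->SetOptions({{"prefix_extractor", "capped:3"}});
+  assert(s.ok());
+}
+#endif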
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { + for (auto bfp_impl : BFP::kAllFixedImpls) { + int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(new BFP(10, bfp_impl)); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Slice upper_bound("foz90000"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + + // first SST with fixed:1 BF + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foq1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + using_full_builder); + ASSERT_EQ(CountIter(iter, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + using_full_builder); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + // second SST with capped:3 BF + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foq5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + { + // BF is capped:3 now + std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // both counters are incremented because BF is "not changed" for 1 of the + // 2 SST files, so the filter is checked once and finds no match.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + using_full_builder * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + // third SST with fixed:2 BF + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foq8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + { + // BF is fixed:2 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); + // the first and last BF are checked + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 4 + using_full_builder * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // only last BF is checked and not found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 5 + using_full_builder * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + } + + // iter_old can only see the first SST, so checked plus 1 + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + using_full_builder * 3); + // iter was created after the first setoptions call so only full filter + // will check the filter + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + using_full_builder * 4); + + { + // keys in all three SSTs are visible to iterator + // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) + // so +2 for checked counter + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 7 + using_full_builder * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 8 + using_full_builder * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 6); + // all three SST are checked because the current options has the same as + // the remaining SST (capped:3) + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 9 + using_full_builder * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 10 + using_full_builder * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); + } + // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? 
+ } +} + +// Create a new column family in a running DB, change prefix_extractor +// dynamically, verify the iterator created on the new column family behaves +// as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { + int iteration = 0; + for (auto bfp_impl : BFP::kAllFixedImpls) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(new BFP(10, bfp_impl)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + // create a new CF and set prefix_extractor dynamically + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + ASSERT_OK(Put(2, "foo3", "bar3")); + ASSERT_OK(Put(2, "foo4", "bar4")); + ASSERT_OK(Put(2, "foo5", "bar5")); + ASSERT_OK(Put(2, "foq6", "bar6")); + ASSERT_OK(Put(2, "fpq7", "bar7")); + dbfull()->Flush(FlushOptions()); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK( + dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); + dbfull()->DestroyColumnFamilyHandle(handles_[2]); + handles_[2] = nullptr; + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + dbfull()->DestroyColumnFamilyHandle(handles_[1]); + handles_[1] = nullptr; + iteration++; + } +} + +// Verify it's possible to change prefix_extractor at runtime and iterators +// behaves as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { + for (auto bfp_impl : BFP::kAllFixedImpls) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(new BFP(10, bfp_impl)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foo5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + 
ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foo8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + + ReadOptions read_options; + read_options.prefix_same_as_start = true; + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + // "fp*" should be skipped + ASSERT_EQ(CountIter(iter, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + + // iterator created before should not be affected and see all keys + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_old, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } +} + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc new file mode 100644 index 000000000..a708c0b1a --- /dev/null +++ b/src/rocksdb/db/db_compaction_filter_test.cc @@ -0,0 +1,872 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +static int cfilter_count = 0; +static int cfilter_skips = 0; + +// This is a static filter used for filtering +// kvs during the compaction process. 
+static std::string NEW_VALUE = "NewValue"; + +class DBTestCompactionFilter : public DBTestBase { + public: + DBTestCompactionFilter() : DBTestBase("/db_compaction_filter_test") {} +}; + +// Param variant of DBTestBase::ChangeCompactOptions +class DBTestCompactionFilterWithCompactParam + : public DBTestCompactionFilter, + public ::testing::WithParamInterface { + public: + DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() { + option_config_ = GetParam(); + Destroy(last_options_); + auto options = CurrentOptions(); + if (option_config_ == kDefault || option_config_ == kUniversalCompaction || + option_config_ == kUniversalCompactionMultiLevel) { + options.create_if_missing = true; + } + if (option_config_ == kLevelSubcompactions || + option_config_ == kUniversalSubcompactions) { + assert(options.max_subcompactions > 1); + } + TryReopen(options); + } +}; + +#ifndef ROCKSDB_VALGRIND_RUN +INSTANTIATE_TEST_CASE_P( + CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam, + ::testing::Values(DBTestBase::OptionConfig::kDefault, + DBTestBase::OptionConfig::kUniversalCompaction, + DBTestBase::OptionConfig::kUniversalCompactionMultiLevel, + DBTestBase::OptionConfig::kLevelSubcompactions, + DBTestBase::OptionConfig::kUniversalSubcompactions)); +#else +// Run fewer cases in valgrind +INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption, + DBTestCompactionFilterWithCompactParam, + ::testing::Values(DBTestBase::OptionConfig::kDefault)); +#endif // ROCKSDB_VALGRIND_RUN + +class KeepFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + cfilter_count++; + return false; + } + + const char* Name() const override { return "KeepFilter"; } +}; + +class DeleteFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + cfilter_count++; + return true; + } + + const char* Name() const override { return "DeleteFilter"; } +}; + +class DeleteISFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + cfilter_count++; + int i = std::stoi(key.ToString()); + if (i > 5 && i <= 105) { + return true; + } + return false; + } + + bool IgnoreSnapshots() const override { return true; } + + const char* Name() const override { return "DeleteFilter"; } +}; + +// Skip x if floor(x/10) is even, use range skips. Requires that keys are +// zero-padded to length 10. 
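+// Worked example of the rule above: for key 0000000012, floor(12/10) = 1 is
+// odd, so the key is kept. For key 0000000025, floor(25/10) = 2 is even, so
+// FilterV2() returns kRemoveAndSkipUntil with *skip_until = "0000000030" and
+// compaction drops every key in ["0000000025", "0000000030") without calling
+// the filter on them again.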
+class SkipEvenFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* skip_until) const override { + cfilter_count++; + int i = std::stoi(key.ToString()); + if (i / 10 % 2 == 0) { + char key_str[100]; + snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10); + *skip_until = key_str; + ++cfilter_skips; + return Decision::kRemoveAndSkipUntil; + } + return Decision::kKeep; + } + + bool IgnoreSnapshots() const override { return true; } + + const char* Name() const override { return "DeleteFilter"; } +}; + +class DelayFilter : public CompactionFilter { + public: + explicit DelayFilter(DBTestBase* d) : db_test(d) {} + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + db_test->env_->addon_time_.fetch_add(1000); + return true; + } + + const char* Name() const override { return "DelayFilter"; } + + private: + DBTestBase* db_test; +}; + +class ConditionalFilter : public CompactionFilter { + public: + explicit ConditionalFilter(const std::string* filtered_value) + : filtered_value_(filtered_value) {} + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return value.ToString() == *filtered_value_; + } + + const char* Name() const override { return "ConditionalFilter"; } + + private: + const std::string* filtered_value_; +}; + +class ChangeFilter : public CompactionFilter { + public: + explicit ChangeFilter() {} + + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* new_value, bool* value_changed) const override { + assert(new_value != nullptr); + *new_value = NEW_VALUE; + *value_changed = true; + return false; + } + + const char* Name() const override { return "ChangeFilter"; } +}; + +class KeepFilterFactory : public CompactionFilterFactory { + public: + explicit KeepFilterFactory(bool check_context = false, + bool check_context_cf_id = false) + : check_context_(check_context), + check_context_cf_id_(check_context_cf_id), + compaction_filter_created_(false) {} + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (check_context_) { + EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); + EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + } + if (check_context_cf_id_) { + EXPECT_EQ(expect_cf_id_.load(), context.column_family_id); + } + compaction_filter_created_ = true; + return std::unique_ptr(new KeepFilter()); + } + + bool compaction_filter_created() const { return compaction_filter_created_; } + + const char* Name() const override { return "KeepFilterFactory"; } + bool check_context_; + bool check_context_cf_id_; + std::atomic_bool expect_full_compaction_; + std::atomic_bool expect_manual_compaction_; + std::atomic expect_cf_id_; + bool compaction_filter_created_; +}; + +class DeleteFilterFactory : public CompactionFilterFactory { + public: + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (context.is_manual_compaction) { + return std::unique_ptr(new DeleteFilter()); + } else { + return std::unique_ptr(nullptr); + } + } + + const char* Name() const override { return "DeleteFilterFactory"; } +}; + +// Delete Filter Factory which ignores snapshots +class DeleteISFilterFactory : 
public CompactionFilterFactory { + public: + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (context.is_manual_compaction) { + return std::unique_ptr<CompactionFilter>(new DeleteISFilter()); + } else { + return std::unique_ptr<CompactionFilter>(nullptr); + } + } + + const char* Name() const override { return "DeleteFilterFactory"; } +}; + +class SkipEvenFilterFactory : public CompactionFilterFactory { + public: + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (context.is_manual_compaction) { + return std::unique_ptr<CompactionFilter>(new SkipEvenFilter()); + } else { + return std::unique_ptr<CompactionFilter>(nullptr); + } + } + + const char* Name() const override { return "SkipEvenFilterFactory"; } +}; + +class DelayFilterFactory : public CompactionFilterFactory { + public: + explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test)); + } + + const char* Name() const override { return "DelayFilterFactory"; } + + private: + DBTestBase* db_test; +}; + +class ConditionalFilterFactory : public CompactionFilterFactory { + public: + explicit ConditionalFilterFactory(const Slice& filtered_value) + : filtered_value_(filtered_value.ToString()) {} + + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr<CompactionFilter>( + new ConditionalFilter(&filtered_value_)); + } + + const char* Name() const override { return "ConditionalFilterFactory"; } + + private: + std::string filtered_value_; +}; + +class ChangeFilterFactory : public CompactionFilterFactory { + public: + explicit ChangeFilterFactory() {} + + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr<CompactionFilter>(new ChangeFilter()); + } + + const char* Name() const override { return "ChangeFilterFactory"; } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilter) { + Options options = CurrentOptions(); + options.max_open_files = -1; + options.num_levels = 3; + options.compaction_filter_factory = std::make_shared<KeepFilterFactory>(); + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Write 100K keys, these are written to a few files in L0. + const std::string value(10, 'x'); + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } + ASSERT_OK(Flush(1)); + + // Push all files to the highest level L2. Verify that + // the compaction at each level invokes the filter for + // all the keys in that level. + cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_EQ(cfilter_count, 100000); + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_NE(NumTableFilesAtLevel(2, 1), 0); + cfilter_count = 0; + + // All the files are in the lowest level. + // Verify that every record now has sequence number zero.
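+ // (Background: compactions into the bottommost level rewrite the sequence
+ // number of any entry that is no longer needed by a snapshot to 0, so the
+ // raw internal-key scan below expects only zero sequence numbers.)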
+ int count = 0; + int total = 0; + Arena arena; + { + InternalKeyComparator icmp(options.comparator); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + ScopedArenaIterator iter(dbfull()->NewInternalIterator( + &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); + } + } + ASSERT_EQ(total, 100000); + ASSERT_EQ(count, 0); + + // overwrite all the 100K keys once again. + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + ASSERT_OK(Put(1, key, value)); + } + ASSERT_OK(Flush(1)); + + // push all files to the highest level L2. This + // means that all keys should pass at least once + // via the compaction filter + cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_EQ(cfilter_count, 100000); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_NE(NumTableFilesAtLevel(2, 1), 0); + + // create a new database with the compaction + // filter in such a way that it deletes all keys + options.compaction_filter_factory = std::make_shared(); + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // write all the keys once again. + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + ASSERT_OK(Put(1, key, value)); + } + ASSERT_OK(Flush(1)); + ASSERT_NE(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0); + + // Push all files to the highest level L2. This + // triggers the compaction filter to delete all keys, + // verify that at the end of the compaction process, + // nothing is left. + cfilter_count = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_EQ(cfilter_count, 100000); + cfilter_count = 0; + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_EQ(cfilter_count, 0); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); + + { + // Scan the entire database to ensure that nothing is left + std::unique_ptr iter( + db_->NewIterator(ReadOptions(), handles_[1])); + iter->SeekToFirst(); + count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); + } + + // The sequence number of the remaining record + // is not zeroed out even though it is at the + // level Lmax because this record is at the tip + count = 0; + { + InternalKeyComparator icmp(options.comparator); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + ScopedArenaIterator iter(dbfull()->NewInternalIterator( + &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); + } +} + +// Tests the edge case where compaction does not produce any output -- all +// entries are deleted. 
The compaction should create a bunch of 'DeleteFile' +// entries in VersionEdit, but none of the 'AddFile's. +TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { + Options options = CurrentOptions(); + options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + DestroyAndReopen(options); + + // put some data + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); + } + + // this will produce an empty file (delete compaction filter) + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(0U, CountLiveFiles()); + + Reopen(options); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + // empty db + ASSERT_TRUE(!itr->Valid()); + + delete itr; +} +#endif // ROCKSDB_LITE + +TEST_P(DBTestCompactionFilterWithCompactParam, + CompactionFilterWithValueChange) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>(); + CreateAndReopenWithCF({"pikachu"}, options); + + // Write 100K+1 keys, these are written to a few files + // in L0. We do this so that the current snapshot points + // to the 100001st key. The compaction filter is not invoked + // on keys that are visible via a snapshot because we + // cannot delete them anyway. + const std::string value(10, 'x'); + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } + + // push all files to lower levels + ASSERT_OK(Flush(1)); + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + } else { + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + } + + // re-write all data again + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } + + // push all files to lower levels. This should + // invoke the compaction filter for all 100000 keys. + ASSERT_OK(Flush(1)); + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + } else { + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + } + + // verify that all keys now have the new value that + // was set by the compaction process. + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + std::string newvalue = Get(1, key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + } +} + +TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { + std::string one, two, three, four; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + PutFixed64(&four, 4); + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.num_levels = 3; + // Filter out keys whose value is 2. + options.compaction_filter_factory = + std::make_shared<ConditionalFilterFactory>(two); + DestroyAndReopen(options); + + // In the same compaction, a value-type entry needs to be deleted based on + // the compaction filter while a merge operand exists for the key. The + // compaction filter result is ignored.
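+ // (Why the filter outcome is ignored here: the compaction must keep the
+ // base value in order to apply the pending merge operand; dropping the
+ // base via the filter would change the merge result. Below, Put("foo")=2
+ // followed by Merge("foo")+=1 still reads back as 3 even though the
+ // filter targets the value 2.)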
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", one)); + ASSERT_OK(Flush()); + std::string newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + + // A value-type key can be deleted by the compaction filter, leaving only + // merge keys. + ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + newvalue = Get("bar"); + ASSERT_EQ("NOT_FOUND", newvalue); + ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + newvalue = Get("bar"); + ASSERT_EQ(two, newvalue); + + // Compaction filter never applies to merge keys. + ASSERT_OK(db_->Put(WriteOptions(), "foobar", one)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two)); + ASSERT_OK(Flush()); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + + // In the same compaction, both the value-type key and the merge-type key + // would be deleted by the compaction filter, and there is a pending merge + // for the key. For both keys, the compaction filter results are ignored. + ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { + KeepFilterFactory* filter = new KeepFilterFactory(true, true); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_filter_factory.reset(filter); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 8; + Reopen(options); + int num_keys_per_file = 400; + for (int j = 0; j < 3; j++) { + // Write several keys. + const std::string value(10, 'x'); + for (int i = 0; i < num_keys_per_file; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%02d", i, j); + Put(key, value); + } + dbfull()->TEST_FlushMemTable(); + // Make sure next file is much smaller so automatic compaction will not + // be triggered. + num_keys_per_file /= 2; + } + dbfull()->TEST_WaitForCompact(); + + // Force a manual compaction + cfilter_count = 0; + filter->expect_manual_compaction_.store(true); + filter->expect_full_compaction_.store(true); + filter->expect_cf_id_.store(0); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(cfilter_count, 700); + ASSERT_EQ(NumSortedRuns(0), 1); + ASSERT_TRUE(filter->compaction_filter_created()); + + // Verify total number of keys is correct after manual compaction.
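+ // (700 = 400 + 200 + 100 keys across the three flushed files; the raw
+ // internal-key scan below re-counts them and checks that the full manual
+ // compaction zeroed every sequence number.)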
+ { + int count = 0; + int total = 0; + Arena arena; + InternalKeyComparator icmp(options.comparator); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* snapshots */); + ScopedArenaIterator iter(dbfull()->NewInternalIterator( + &arena, &range_del_agg, kMaxSequenceNumber)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); + } + ASSERT_EQ(total, 700); + ASSERT_EQ(count, 0); + } +} +#endif // ROCKSDB_LITE + +TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) { + KeepFilterFactory* filter = new KeepFilterFactory(false, true); + filter->expect_cf_id_.store(1); + + Options options = CurrentOptions(); + options.compaction_filter_factory.reset(filter); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 2; + CreateAndReopenWithCF({"pikachu"}, options); + + int num_keys_per_file = 400; + for (int j = 0; j < 3; j++) { + // Write several keys. + const std::string value(10, 'x'); + for (int i = 0; i < num_keys_per_file; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%08d%02d", i, j); + Put(1, key, value); + } + Flush(1); + // Make sure next file is much smaller so automatic compaction will not + // be triggered. + num_keys_per_file /= 2; + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_TRUE(filter->compaction_filter_created()); +} + +#ifndef ROCKSDB_LITE +// Compaction filters apply to all records, regardless of snapshots. +TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { + std::string five = ToString(5); + Options options = CurrentOptions(); + options.compaction_filter_factory = std::make_shared<DeleteISFilterFactory>(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Put some data. + const Snapshot* snapshot = nullptr; + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); + + if (table == 0) { + snapshot = db_->GetSnapshot(); + } + } + assert(snapshot != nullptr); + + cfilter_count = 0; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // The filter should be invoked for all 40 records. + ASSERT_EQ(40, cfilter_count); + + { + // Scan the entire database as of the snapshot; the filter ignores the + // snapshot, so only keys 0-5 remain visible. + ReadOptions read_options; + read_options.snapshot = snapshot; + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + int count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, 6); + read_options.snapshot = nullptr; + std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options)); + iter1->SeekToFirst(); + count = 0; + while (iter1->Valid()) { + count++; + iter1->Next(); + } + // We have deleted 10 keys from 40 using the compaction filter: + // keys 6-9 before the snapshot and 100-105 after the snapshot + ASSERT_EQ(count, 30); + } + + // Release the snapshot and compact again -> now all records should be + // removed. + db_->ReleaseSnapshot(snapshot); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTestCompactionFilter, SkipUntil) { + Options options = CurrentOptions(); + options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Write four batches of keys, each flushed into its own L0 file.
+ for (int table = 0; table < 4; ++table) { + // Key ranges in tables are [0, 38], [106, 149], [212, 260], [318, 371]. + for (int i = table * 6; i < 39 + table * 11; ++i) { + char key[100]; + snprintf(key, sizeof(key), "%010d", table * 100 + i); + Put(key, std::to_string(table * 1000 + i)); + } + Flush(); + } + + cfilter_skips = 0; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + // Number of skips in tables: 2, 3, 3, 3. + ASSERT_EQ(11, cfilter_skips); + + for (int table = 0; table < 4; ++table) { + for (int i = table * 6; i < 39 + table * 11; ++i) { + int k = table * 100 + i; + char key[100]; + snprintf(key, sizeof(key), "%010d", table * 100 + i); + auto expected = std::to_string(table * 1000 + i); + std::string val; + Status s = db_->Get(ReadOptions(), key, &val); + if (k / 10 % 2 == 0) { + ASSERT_TRUE(s.IsNotFound()); + } else { + ASSERT_OK(s); + ASSERT_EQ(expected, val); + } + } + } +} + +TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) { + BlockBasedTableOptions table_options; + table_options.whole_key_filtering = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(100, false)); + + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewCappedPrefixTransform(9)); + options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + DestroyAndReopen(options); + + Put("0000000010", "v10"); + Put("0000000020", "v20"); // skipped + Put("0000000050", "v50"); + Flush(); + + cfilter_skips = 0; + EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + EXPECT_EQ(1, cfilter_skips); + + Status s; + std::string val; + + s = db_->Get(ReadOptions(), "0000000010", &val); + ASSERT_OK(s); + EXPECT_EQ("v10", val); + + s = db_->Get(ReadOptions(), "0000000020", &val); + EXPECT_TRUE(s.IsNotFound()); + + s = db_->Get(ReadOptions(), "0000000050", &val); + ASSERT_OK(s); + EXPECT_EQ("v50", val); +} + +class TestNotSupportedFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return true; + } + + const char* Name() const override { return "NotSupported"; } + bool IgnoreSnapshots() const override { return false; } +}; + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) { + Options options = CurrentOptions(); + options.compaction_filter = new TestNotSupportedFilter(); + DestroyAndReopen(options); + + Put("a", "v10"); + Put("z", "v20"); + Flush(); + + Put("a", "v10"); + Put("z", "v20"); + Flush(); + + // Compaction should fail because IgnoreSnapshots() = false + EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsNotSupported()); + + delete options.compaction_filter; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc new file mode 100644 index 000000000..635aca135 --- /dev/null +++ b/src/rocksdb/db/db_compaction_test.cc @@ -0,0 +1,5167 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/concurrent_task_limiter.h" +#include "rocksdb/experimental.h" +#include "rocksdb/sst_file_writer.h" +#include "rocksdb/utilities/convenience.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "util/concurrent_task_limiter_impl.h" + +namespace ROCKSDB_NAMESPACE { + +// SYNC_POINT is not supported in released Windows mode. +#if !defined(ROCKSDB_LITE) + +class DBCompactionTest : public DBTestBase { + public: + DBCompactionTest() : DBTestBase("/db_compaction_test") {} +}; + +class DBCompactionTestWithParam + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") { + max_subcompactions_ = std::get<0>(GetParam()); + exclusive_manual_compaction_ = std::get<1>(GetParam()); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + uint32_t max_subcompactions_; + bool exclusive_manual_compaction_; +}; + +class DBCompactionDirectIOTest : public DBCompactionTest, + public ::testing::WithParamInterface { + public: + DBCompactionDirectIOTest() : DBCompactionTest() {} +}; + +namespace { + +class FlushedFileCollector : public EventListener { + public: + FlushedFileCollector() {} + ~FlushedFileCollector() override {} + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + std::lock_guard lock(mutex_); + flushed_files_.push_back(info.file_path); + } + + std::vector GetFlushedFiles() { + std::lock_guard lock(mutex_); + std::vector result; + for (auto fname : flushed_files_) { + result.push_back(fname); + } + return result; + } + + void ClearFlushedFiles() { flushed_files_.clear(); } + + private: + std::vector flushed_files_; + std::mutex mutex_; +}; + +class CompactionStatsCollector : public EventListener { +public: + CompactionStatsCollector() + : compaction_completed_(static_cast(CompactionReason::kNumOfReasons)) { + for (auto& v : compaction_completed_) { + v.store(0); + } + } + + ~CompactionStatsCollector() override {} + + void OnCompactionCompleted(DB* /* db */, + const CompactionJobInfo& info) override { + int k = static_cast(info.compaction_reason); + int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); + assert(k >= 0 && k < num_of_reasons); + compaction_completed_[k]++; + } + + void OnExternalFileIngested( + DB* /* db */, const ExternalFileIngestionInfo& /* info */) override { + int k = static_cast(CompactionReason::kExternalSstIngestion); + compaction_completed_[k]++; + } + + void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override { + int k = static_cast(CompactionReason::kFlush); + compaction_completed_[k]++; + } + + int NumberOfCompactions(CompactionReason reason) const { + int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); + int k = static_cast(reason); + assert(k >= 0 && k < num_of_reasons); + return compaction_completed_.at(k).load(); + } + +private: + std::vector> compaction_completed_; +}; + +class 
SstStatsCollector : public EventListener { + public: + SstStatsCollector() : num_ssts_creation_started_(0) {} + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /* info */) override { + ++num_ssts_creation_started_; + } + + int num_ssts_creation_started() { return num_ssts_creation_started_; } + + private: + std::atomic num_ssts_creation_started_; +}; + +static const int kCDTValueSize = 1000; +static const int kCDTKeysPerBuffer = 4; +static const int kCDTNumLevels = 8; +Options DeletionTriggerOptions(Options options) { + options.compression = kNoCompression; + options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24); + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_size_to_maintain = 0; + options.num_levels = kCDTNumLevels; + options.level0_file_num_compaction_trigger = 1; + options.target_file_size_base = options.write_buffer_size * 2; + options.target_file_size_multiplier = 2; + options.max_bytes_for_level_base = + options.target_file_size_base * options.target_file_size_multiplier; + options.max_bytes_for_level_multiplier = 2; + options.disable_auto_compactions = false; + return options; +} + +bool HaveOverlappingKeyRanges( + const Comparator* c, + const SstFileMetaData& a, const SstFileMetaData& b) { + if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { + if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->Compare(a.largestkey, b.largestkey) <= 0) { + if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; +} + +// Identifies all files between level "min_level" and "max_level" +// which has overlapping key range with "input_file_meta". +void GetOverlappingFileNumbersForLevelCompaction( + const ColumnFamilyMetaData& cf_meta, + const Comparator* comparator, + int min_level, int max_level, + const SstFileMetaData* input_file_meta, + std::set* overlapping_file_names) { + std::set overlapping_files; + overlapping_files.insert(input_file_meta); + for (int m = min_level; m <= max_level; ++m) { + for (auto& file : cf_meta.levels[m].files) { + for (auto* included_file : overlapping_files) { + if (HaveOverlappingKeyRanges( + comparator, *included_file, file)) { + overlapping_files.insert(&file); + overlapping_file_names->insert(file.name); + break; + } + } + } + } +} + +void VerifyCompactionResult( + const ColumnFamilyMetaData& cf_meta, + const std::set& overlapping_file_numbers) { +#ifndef NDEBUG + for (auto& level : cf_meta.levels) { + for (auto& file : level.files) { + assert(overlapping_file_numbers.find(file.name) == + overlapping_file_numbers.end()); + } + } +#endif +} + +/* + * Verifies compaction stats of cfd are valid. 
+ * + * For each level of cfd, its compaction stats are valid if + * 1) sum(stat.counts) == stat.count, and + * 2) stat.counts[i] == collector.NumberOfCompactions(i) + */ +void VerifyCompactionStats(ColumnFamilyData& cfd, + const CompactionStatsCollector& collector) { +#ifndef NDEBUG + InternalStats* internal_stats_ptr = cfd.internal_stats(); + ASSERT_TRUE(internal_stats_ptr != nullptr); + const std::vector& comp_stats = + internal_stats_ptr->TEST_GetCompactionStats(); + const int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); + std::vector counts(num_of_reasons, 0); + // Count the number of compactions caused by each CompactionReason across + // all levels. + for (const auto& stat : comp_stats) { + int sum = 0; + for (int i = 0; i < num_of_reasons; i++) { + counts[i] += stat.counts[i]; + sum += stat.counts[i]; + } + ASSERT_EQ(sum, stat.count); + } + // Verify InternalStats bookkeeping matches that of CompactionStatsCollector, + // assuming that all compactions complete. + for (int i = 0; i < num_of_reasons; i++) { + ASSERT_EQ(collector.NumberOfCompactions(static_cast(i)), counts[i]); + } +#endif /* NDEBUG */ +} + +const SstFileMetaData* PickFileRandomly( + const ColumnFamilyMetaData& cf_meta, + Random* rand, + int* level = nullptr) { + auto file_id = rand->Uniform(static_cast( + cf_meta.file_count)) + 1; + for (auto& level_meta : cf_meta.levels) { + if (file_id <= level_meta.files.size()) { + if (level != nullptr) { + *level = level_meta.level; + } + auto result = rand->Uniform(file_id); + return &(level_meta.files[result]); + } + file_id -= static_cast(level_meta.files.size()); + } + assert(false); + return nullptr; +} +} // anonymous namespace + +#ifndef ROCKSDB_VALGRIND_RUN +// All the TEST_P tests run once with sub_compactions disabled (i.e. +// options.max_subcompactions = 1) and once with it enabled +TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { + for (int tid = 0; tid < 3; ++tid) { + uint64_t db_size[2]; + Options options = DeletionTriggerOptions(CurrentOptions()); + options.max_subcompactions = max_subcompactions_; + + if (tid == 1) { + // the following only disable stats update in DB::Open() + // and should not affect the result of this test. + options.skip_stats_update_on_db_open = true; + } else if (tid == 2) { + // third pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } + + DestroyAndReopen(options); + Random rnd(301); + + const int kTestSize = kCDTKeysPerBuffer * 1024; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(RandomString(&rnd, kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + db_size[0] = Size(Key(0), Key(kTestSize - 1)); + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + db_size[1] = Size(Key(0), Key(kTestSize - 1)); + + // must have much smaller db size. 
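+  // (A rough worked example of the bound checked next, with hypothetical
+  //  numbers: if db_size[0] was ~30MB before the deletes, db_size[1] must
+  //  come out under ~10MB, i.e. deletion-triggered compaction has to
+  //  reclaim at least two thirds of the space.)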
+ ASSERT_GT(db_size[0] / 3, db_size[1]); + } +} +#endif // ROCKSDB_VALGRIND_RUN + +TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { + // For each options type we test following + // - Enable preserve_deletes + // - write bunch of keys and deletes + // - Set start_seqnum to the beginning; compact; check that keys are present + // - rewind start_seqnum way forward; compact; check that keys are gone + + for (int tid = 0; tid < 3; ++tid) { + Options options = DeletionTriggerOptions(CurrentOptions()); + options.max_subcompactions = max_subcompactions_; + options.preserve_deletes=true; + options.num_levels = 2; + + if (tid == 1) { + options.skip_stats_update_on_db_open = true; + } else if (tid == 2) { + // third pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + + DestroyAndReopen(options); + Random rnd(301); + // highlight the default; all deletes should be preserved + SetPreserveDeletesSequenceNumber(0); + + const int kTestSize = kCDTKeysPerBuffer; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(RandomString(&rnd, kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + // to ensure we tackle all tombstones + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + cro.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->CompactRange(cro, nullptr, nullptr); + + // check that normal user iterator doesn't see anything + Iterator* db_iter = dbfull()->NewIterator(ReadOptions()); + int i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + i++; + } + ASSERT_EQ(i, 0); + delete db_iter; + + // check that iterator that sees internal keys sees tombstones + ReadOptions ro; + ro.iter_start_seqnum=1; + db_iter = dbfull()->NewIterator(ro); + i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + i++; + } + ASSERT_EQ(i, 4); + delete db_iter; + + // now all deletes should be gone + SetPreserveDeletesSequenceNumber(100000000); + dbfull()->CompactRange(cro, nullptr, nullptr); + + db_iter = dbfull()->NewIterator(ro); + i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + i++; + } + ASSERT_EQ(i, 0); + delete db_iter; + } +} + +TEST_F(DBCompactionTest, SkipStatsUpdateTest) { + // This test verify UpdateAccumulatedStats is not on + // if options.skip_stats_update_on_db_open = true + // The test will need to be updated if the internal behavior changes. + + Options options = DeletionTriggerOptions(CurrentOptions()); + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + Random rnd(301); + + const int kTestSize = kCDTKeysPerBuffer * 512; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(RandomString(&rnd, kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + + ASSERT_OK(Flush()); + + Close(); + + int update_acc_stats_called = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionStorageInfo::UpdateAccumulatedStats", + [&](void* /* arg */) { ++update_acc_stats_called; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Reopen the DB with stats-update disabled + options.skip_stats_update_on_db_open = true; + options.max_open_files = 20; + Reopen(options); + + ASSERT_EQ(update_acc_stats_called, 0); + + // Repeat the reopen process, but this time we enable + // stats-update. 
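+  // (UpdateAccumulatedStats aggregates per-file statistics such as deletion
+  //  counts from table properties, so the sync point should fire while
+  //  Reopen() loads the existing files.)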
+ options.skip_stats_update_on_db_open = false; + Reopen(options); + + ASSERT_GT(update_acc_stats_called, 0); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, TestTableReaderForCompaction) { + Options options = CurrentOptions(); + options.env = env_; + options.new_table_reader_for_compaction_inputs = true; + options.max_open_files = 20; + options.level0_file_num_compaction_trigger = 3; + DestroyAndReopen(options); + Random rnd(301); + + int num_table_cache_lookup = 0; + int num_new_table_reader = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", [&](void* arg) { + assert(arg != nullptr); + bool no_io = *(reinterpret_cast(arg)); + if (!no_io) { + // filter out cases for table properties queries. + num_table_cache_lookup++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::GetTableReader:0", + [&](void* /*arg*/) { num_new_table_reader++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) { + ASSERT_OK(Put(Key(k), Key(k))); + ASSERT_OK(Put(Key(10 - k), "bar")); + if (k < options.level0_file_num_compaction_trigger - 1) { + num_table_cache_lookup = 0; + Flush(); + dbfull()->TEST_WaitForCompact(); + // preloading iterator issues one table cache lookup and create + // a new table reader, if not preloaded. + int old_num_table_cache_lookup = num_table_cache_lookup; + ASSERT_GE(num_table_cache_lookup, 1); + ASSERT_EQ(num_new_table_reader, 1); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + ASSERT_EQ(Key(k), Get(Key(k))); + // lookup iterator from table cache and no need to create a new one. + ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2); + ASSERT_EQ(num_new_table_reader, 0); + } + } + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + Flush(); + dbfull()->TEST_WaitForCompact(); + // Preloading iterator issues one table cache lookup and creates + // a new table reader. One file is created for flush and one for compaction. + // Compaction inputs make no table cache look-up for data/range deletion + // iterators + // May preload table cache too. + ASSERT_GE(num_table_cache_lookup, 2); + int old_num_table_cache_lookup2 = num_table_cache_lookup; + + // Create new iterator for: + // (1) 1 for verifying flush results + // (2) 1 for verifying compaction results. + // (3) New TableReaders will not be created for compaction inputs + ASSERT_EQ(num_new_table_reader, 2); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + ASSERT_EQ(Key(1), Get(Key(1))); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5); + ASSERT_EQ(num_new_table_reader, 0); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + db_->CompactRange(cro, nullptr, nullptr); + // Only verifying compaction outputs issues one table cache lookup + // for both data block and range deletion block). + // May preload table cache too. + ASSERT_GE(num_table_cache_lookup, 1); + old_num_table_cache_lookup2 = num_table_cache_lookup; + // One for verifying compaction results. + // No new iterator created for compaction. 
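+  // (The low lookup counts depend on new_table_reader_for_compaction_inputs
+  //  set above: compaction inputs get private TableReaders instead of going
+  //  through the table cache, so only verification reads hit the cache.)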
+ ASSERT_EQ(num_new_table_reader, 1); + + num_table_cache_lookup = 0; + num_new_table_reader = 0; + ASSERT_EQ(Key(1), Get(Key(1))); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); + ASSERT_EQ(num_new_table_reader, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { + for (int tid = 0; tid < 2; ++tid) { + uint64_t db_size[3]; + Options options = DeletionTriggerOptions(CurrentOptions()); + options.max_subcompactions = max_subcompactions_; + + if (tid == 1) { + // second pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } + + DestroyAndReopen(options); + Random rnd(301); + + // round 1 --- insert key/value pairs. + const int kTestSize = kCDTKeysPerBuffer * 512; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(RandomString(&rnd, kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + db_size[0] = Size(Key(0), Key(kTestSize - 1)); + Close(); + + // round 2 --- disable auto-compactions and issue deletions. + options.create_if_missing = false; + options.disable_auto_compactions = true; + Reopen(options); + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + db_size[1] = Size(Key(0), Key(kTestSize - 1)); + Close(); + // as auto_compaction is off, we shouldn't see too much reduce + // in db size. + ASSERT_LT(db_size[0] / 3, db_size[1]); + + // round 3 --- reopen db with auto_compaction on and see if + // deletion compensation still work. + options.disable_auto_compactions = false; + Reopen(options); + // insert relatively small amount of data to trigger auto compaction. + for (int k = 0; k < kTestSize / 10; ++k) { + ASSERT_OK(Put(Key(k), values[k])); + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + db_size[2] = Size(Key(0), Key(kTestSize - 1)); + // this time we're expecting significant drop in size. + ASSERT_GT(db_size[0] / 3, db_size[2]); + } +} + +TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { + uint64_t db_size[3]; + for (int test = 0; test < 2; ++test) { + Options options = DeletionTriggerOptions(CurrentOptions()); + options.skip_stats_update_on_db_open = (test == 0); + + env_->random_read_counter_.Reset(); + DestroyAndReopen(options); + Random rnd(301); + + // round 1 --- insert key/value pairs. + const int kTestSize = kCDTKeysPerBuffer * 512; + std::vector values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(RandomString(&rnd, kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + db_size[0] = Size(Key(0), Key(kTestSize - 1)); + Close(); + + // round 2 --- disable auto-compactions and issue deletions. + options.create_if_missing = false; + options.disable_auto_compactions = true; + + env_->random_read_counter_.Reset(); + Reopen(options); + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + db_size[1] = Size(Key(0), Key(kTestSize - 1)); + Close(); + // as auto_compaction is off, we shouldn't see too much reduce + // in db size. + ASSERT_LT(db_size[0] / 3, db_size[1]); + + // round 3 --- reopen db with auto_compaction on and see if + // deletion compensation still work. 
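+  // ("Deletion compensation" refers to compensated file sizes: tombstones
+  //  make a file look larger to the compaction picker, so delete-heavy
+  //  files are picked sooner. It relies on the stats loaded at DB open.)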
+ options.disable_auto_compactions = false; + Reopen(options); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + db_size[2] = Size(Key(0), Key(kTestSize - 1)); + + if (options.skip_stats_update_on_db_open) { + // If update stats on DB::Open is disable, we don't expect + // deletion entries taking effect. + ASSERT_LT(db_size[0] / 3, db_size[2]); + } else { + // Otherwise, we should see a significant drop in db size. + ASSERT_GT(db_size[0] / 3, db_size[2]); + } + } +} + + +TEST_P(DBCompactionTestWithParam, CompactionTrigger) { + const int kNumKeysPerFile = 100; + + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 3; + options.max_subcompactions = max_subcompactions_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 100KB (100 values, each 1K) + for (int i = 0; i < kNumKeysPerFile; i++) { + values.push_back(RandomString(&rnd, 990)); + ASSERT_OK(Put(1, Key(i), values[i])); + } + // put extra key to trigger flush + ASSERT_OK(Put(1, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); + } + + // generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < kNumKeysPerFile; i++) { + values.push_back(RandomString(&rnd, 990)); + ASSERT_OK(Put(1, Key(i), values[i])); + } + // put extra key to trigger flush + ASSERT_OK(Put(1, "", "")); + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); +} + +TEST_F(DBCompactionTest, BGCompactionsAllowed) { + // Create several column families. Make compaction triggers in all of them + // and see number of compactions scheduled to be less than allowed. + const int kNumKeysPerFile = 100; + + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 3; + // Should speed up compaction when there are 4 files. + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 20; + options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large + options.max_background_compactions = 3; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + + // Block all threads in thread pool. 
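+  // (Four sleeping tasks occupy all four LOW-priority workers, so nothing
+  //  can actually run; GetThreadPoolQueueLen() below therefore counts
+  //  exactly the compactions the DB decided to schedule.)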
+ const size_t kTotalTasks = 4; + env_->SetBackgroundThreads(4, Env::LOW); + test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; + for (size_t i = 0; i < kTotalTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + + CreateAndReopenWithCF({"one", "two", "three"}, options); + + Random rnd(301); + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. + ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Create two more files for one column family, which triggers speed up + // condition, three compactions will be scheduled. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(2, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(2, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 2)); + } + ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Unblock all threads to unblock all compactions. + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } + dbfull()->TEST_WaitForCompact(); + + // Verify number of compactions allowed will come back to 1. + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. 
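+  // (The "speed up" condition is the one noted in the options above: once a
+  //  column family reaches 4 L0 files, twice the compaction trigger, the
+  //  number of compactions allowed rises toward max_background_compactions.)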
+ ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } +} + +TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(1, Key(i), values[i])); + } + + // Reopening moves updates to level-0 + ReopenWithColumnFamilies({"default", "pikachu"}, options); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */); + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); + for (int i = 0; i < 80; i++) { + ASSERT_EQ(Get(1, Key(i)), values[i]); + } +} + +TEST_F(DBCompactionTest, MinorCompactionsHappen) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 10000; + CreateAndReopenWithCF({"pikachu"}, options); + + const int N = 500; + + int starting_num_tables = TotalTableFiles(1); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v'))); + } + int ending_num_tables = TotalTableFiles(1); + ASSERT_GT(ending_num_tables, starting_num_tables); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); + } + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBCompactionTest, UserKeyCrossFile1) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + DestroyAndReopen(options); + + // create first file and flush to l0 + Put("4", "A"); + Put("3", "A"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + Put("2", "A"); + Delete("3"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("3")); + + // move both files down to l1 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ("NOT_FOUND", Get("3")); + + for (int i = 0; i < 3; i++) { + Put("2", "B"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("NOT_FOUND", Get("3")); +} + +TEST_F(DBCompactionTest, UserKeyCrossFile2) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + DestroyAndReopen(options); + + // create first file and flush to l0 + Put("4", "A"); + Put("3", "A"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + Put("2", "A"); + SingleDelete("3"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("3")); + + // move both files down to l1 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ("NOT_FOUND", Get("3")); + + for (int i = 0; i < 3; i++) { + Put("2", "B"); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("NOT_FOUND", Get("3")); +} + +TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { + Options options = CurrentOptions(); + options.compaction_style = 
kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + // compaction options + CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + compact_opt.output_file_size_limit = 4096; + const size_t key_len = + static_cast(compact_opt.output_file_size_limit) / 5; + + DestroyAndReopen(options); + + std::vector snaps; + + // create first file and flush to l0 + for (auto& key : {"1", "2", "3", "3", "3", "3"}) { + Put(key, std::string(key_len, 'A')); + snaps.push_back(dbfull()->GetSnapshot()); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + // create second file and flush to l0 + for (auto& key : {"3", "4", "5", "6", "7", "8"}) { + Put(key, std::string(key_len, 'A')); + snaps.push_back(dbfull()->GetSnapshot()); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + // move both files down to l1 + dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1); + + // release snap so that first instance of key(3) can have seqId=0 + for (auto snap : snaps) { + dbfull()->ReleaseSnapshot(snap); + } + + // create 3 files in l0 so to trigger compaction + for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { + Put("2", std::string(1, 'A')); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + } + + dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put("", "")); +} + +TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) { + // github issue #2249 + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + DestroyAndReopen(options); + + // create two files in l1 that we can compact + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) { + // make l0 files' ranges overlap to avoid trivial move + Put(std::to_string(2 * i), std::string(1, 'A')); + Put(std::to_string(2 * i + 1), std::string(1, 'A')); + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1); + } + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_EQ(2, cf_meta.levels[1].files.size()); + std::vector input_filenames; + for (const auto& sst_file : cf_meta.levels[1].files) { + input_filenames.push_back(sst_file.name); + } + + // note CompactionOptions::output_file_size_limit is unset. + CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + dbfull()->CompactFiles(compact_opt, input_filenames, 1); +} + +// Check that writes done during a memtable compaction are recovered +// if the database is shutdown during the memtable compaction. 
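+// ("Memtable compaction" is the older LevelDB term for a flush: the full
+// memtable is written out as an L0 file while new writes switch over to a
+// fresh memtable and log file, which is what the comments below track.)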
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) { + do { + Options options = CurrentOptions(); + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger a long memtable compaction and reopen the database during it + ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file + ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable + ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction + ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1")); + ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2")); + } while (ChangeOptions()); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { + int32_t trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + int32_t num_keys = 80; + int32_t value_size = 100 * 1024; // 100 KB + + Random rnd(301); + std::vector values; + for (int i = 0; i < num_keys; i++) { + values.push_back(RandomString(&rnd, value_size)); + ASSERT_OK(Put(Key(i), values[i])); + } + + // Reopening moves updates to L0 + Reopen(options); + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0 + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1 + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + LiveFileMetaData level0_file = metadata[0]; // L0 file meta + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + + // Compaction will initiate a trivial move from L0 to L1 + dbfull()->CompactRange(cro, nullptr, nullptr); + + // File moved From L0 to L1 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0 + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1 + + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name); + ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size); + + for (int i = 0; i < num_keys; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + + ASSERT_EQ(trivial_move, 1); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + // non overlapping ranges + std::vector> ranges = { + {100, 199}, + {300, 399}, + {0, 99}, + {200, 299}, + {600, 699}, + {400, 499}, + {500, 550}, + {551, 599}, + }; + int32_t value_size = 10 * 1024; // 10 KB + + 
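+  // (A trivial move is only legal when the compaction inputs overlap nothing
+  //  in the target level, so the disjoint ranges above can be moved with a
+  //  manifest edit alone; the overlapping variant later in this test must
+  //  fall back to a real rewrite, which the two counters distinguish.)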
Random rnd(301); + std::map values; + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + ASSERT_OK(Flush()); + } + + int32_t level0_files = NumTableFilesAtLevel(0, 0); + ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0 + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1 + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + + // Since data is non-overlapping we expect compaction to initiate + // a trivial move + db_->CompactRange(cro, nullptr, nullptr); + // We expect that all the files were trivially moved from L0 to L1 + ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files); + + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + ASSERT_EQ(Get(Key(j)), values[j]); + } + } + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + trivial_move = 0; + non_trivial_move = 0; + values.clear(); + DestroyAndReopen(options); + // Same ranges as above but overlapping + ranges = { + {100, 199}, + {300, 399}, + {0, 99}, + {200, 299}, + {600, 699}, + {400, 499}, + {500, 560}, // this range overlap with the next one + {551, 599}, + }; + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + ASSERT_OK(Flush()); + } + + db_->CompactRange(cro, nullptr, nullptr); + + for (size_t i = 0; i < ranges.size(); i++) { + for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { + ASSERT_EQ(Get(Key(j)), values[j]); + } + } + ASSERT_EQ(trivial_move, 0); + ASSERT_EQ(non_trivial_move, 1); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.write_buffer_size = 10 * 1024 * 1024; + options.num_levels = 7; + options.max_subcompactions = max_subcompactions_; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 300] + for (int32_t i = 0; i <= 300; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [600 => 700] + for (int32_t i = 600; i <= 700; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 6; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L6 + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0)); + + 
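+  // (With change_level=true, CompactRange compacts the requested range and
+  //  then, since nothing below overlaps, relocates the output straight to
+  //  target_level 6; that registers as the single trivial move checked
+  //  next.)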
ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + for (int32_t i = 0; i <= 300; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (int32_t i = 600; i <= 700; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + bool first = true; + // Purpose of dependencies: + // 4 -> 1: ensure the order of two non-trivial compactions + // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions + // are installed + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"}, + {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"}, + {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (first) { + first = false; + TEST_SYNC_POINT("DBCompaction::ManualPartial:4"); + TEST_SYNC_POINT("DBCompaction::ManualPartial:3"); + } else { // second non-trivial compaction + TEST_SYNC_POINT("DBCompaction::ManualPartial:2"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.num_levels = 7; + options.max_subcompactions = max_subcompactions_; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + options.target_file_size_base = 1 << 23; // 8 MB + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 6; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + // Trivial move the two non-overlapping files to level 6 + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L6 + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 1 files in L0 + ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false)); + ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false)); + // 2 files in 
L6, 1 file in L5 + ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 6); + ASSERT_EQ(non_trivial_move, 0); + + ROCKSDB_NAMESPACE::port::Thread threads([&] { + compact_options.change_level = false; + compact_options.exclusive_manual_compaction = false; + std::string begin_string = Key(0); + std::string end_string = Key(199); + Slice begin(begin_string); + Slice end(end_string); + // First non-trivial compaction is triggered + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + }); + + TEST_SYNC_POINT("DBCompaction::ManualPartial:1"); + // file 4 [300 => 400) + for (int32_t i = 300; i <= 400; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 5 [400 => 500) + for (int32_t i = 400; i <= 500; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 6 [500 => 600) + for (int32_t i = 500; i <= 600; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + // Second non-trivial compaction is triggered + ASSERT_OK(Flush()); + + // Before two non-trivial compactions are installed, there are 3 files in L0 + ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0)); + TEST_SYNC_POINT("DBCompaction::ManualPartial:5"); + + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + // After two non-trivial compactions are installed, there is 1 file in L6, and + // 1 file in L1 + ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0)); + threads.join(); + + for (int32_t i = 0; i < 600; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +// Disable as the test is flaky. +TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + bool first = true; + bool second = true; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"}, + {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { + if (first) { + TEST_SYNC_POINT("DBCompaction::PartialFill:4"); + first = false; + TEST_SYNC_POINT("DBCompaction::PartialFill:3"); + } else if (second) { + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + + DestroyAndReopen(options); + // make sure all background compaction jobs can be scheduled + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for (int32_t i = 0; i < 100; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = RandomString(&rnd, value_size); 
+ ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L2 + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L2, 1 in L0 + ASSERT_EQ("1,0,2", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false)); + // 2 files in L2, 1 in L1 + ASSERT_EQ("0,1,2", FilesPerLevel(0)); + + ASSERT_EQ(trivial_move, 2); + ASSERT_EQ(non_trivial_move, 0); + + ROCKSDB_NAMESPACE::port::Thread threads([&] { + compact_options.change_level = false; + compact_options.exclusive_manual_compaction = false; + std::string begin_string = Key(0); + std::string end_string = Key(199); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); + }); + + TEST_SYNC_POINT("DBCompaction::PartialFill:1"); + // Many files 4 [300 => 4300) + for (int32_t i = 0; i <= 5; i++) { + for (int32_t j = 300; j < 4300; j++) { + if (j == 2300) { + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + } + values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + } + + // Verify level sizes + uint64_t target_size = 4 * options.max_bytes_for_level_base; + for (int32_t i = 1; i < options.num_levels; i++) { + ASSERT_LE(SizeAtLevel(i), target_size); + target_size = static_cast(target_size * + options.max_bytes_for_level_multiplier); + } + + TEST_SYNC_POINT("DBCompaction::PartialFill:2"); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + threads.join(); + + for (int32_t i = 0; i < 4300; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } +} + +TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL", + "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"}, + {"DBImpl::WaitForPendingWrites:BeforeBlock", + "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}}); + + Options options = CurrentOptions(); + options.unordered_write = true; + DestroyAndReopen(options); + Put("foo", "v1"); + ASSERT_OK(Flush()); + + Put("bar", "v1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer([&]() { Put("foo", "v2"); }); + + TEST_SYNC_POINT( + "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + writer.join(); + ASSERT_EQ(Get("foo"), "v2"); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + Reopen(options); + ASSERT_EQ(Get("foo"), "v2"); +} + +TEST_F(DBCompactionTest, DeleteFileRange) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 3; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + // Add 2 non-overlapping files + Random rnd(301); + std::map values; + + // file 1 [0 => 100] + for 
(int32_t i = 0; i < 100; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // file 2 [100 => 300] + for (int32_t i = 100; i < 300; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // 2 files in L0 + ASSERT_EQ("2", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + // 2 files in L2 + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // file 3 [ 0 => 200] + for (int32_t i = 0; i < 200; i++) { + values[i] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + // Many files 4 [300 => 4300) + for (int32_t i = 0; i <= 5; i++) { + for (int32_t j = 300; j < 4300; j++) { + if (j == 2300) { + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + } + values[j] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(j), values[j])); + } + } + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + + // Verify level sizes + uint64_t target_size = 4 * options.max_bytes_for_level_base; + for (int32_t i = 1; i < options.num_levels; i++) { + ASSERT_LE(SizeAtLevel(i), target_size); + target_size = static_cast(target_size * + options.max_bytes_for_level_multiplier); + } + + size_t old_num_files = CountFiles(); + std::string begin_string = Key(1000); + std::string end_string = Key(2000); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + + int32_t deleted_count = 0; + for (int32_t i = 0; i < 4300; i++) { + if (i < 1000 || i > 2000) { + ASSERT_EQ(Get(Key(i)), values[i]); + } else { + ReadOptions roptions; + std::string result; + Status s = db_->Get(roptions, Key(i), &result); + ASSERT_TRUE(s.IsNotFound() || s.ok()); + if (s.IsNotFound()) { + deleted_count++; + } + } + } + ASSERT_GT(deleted_count, 0); + begin_string = Key(5000); + end_string = Key(6000); + Slice begin1(begin_string); + Slice end1(end_string); + // Try deleting files in range which contain no keys + ASSERT_OK( + DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1)); + + // Push data from level 0 to level 1 to force all data to be deleted + // Note that we don't delete level 0 files + compact_options.change_level = true; + compact_options.target_level = 1; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + dbfull()->TEST_WaitForCompact(); + + ASSERT_OK( + DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); + + int32_t deleted_count2 = 0; + for (int32_t i = 0; i < 4300; i++) { + ReadOptions roptions; + std::string result; + Status s = db_->Get(roptions, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + deleted_count2++; + } + ASSERT_GT(deleted_count2, deleted_count); + size_t new_num_files = CountFiles(); + ASSERT_GT(old_num_files, new_num_files); +} + +TEST_F(DBCompactionTest, DeleteFilesInRanges) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.max_background_compactions = 3; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + + // file [0 => 100), [100 => 200), ... 
[900, 1000) + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + values[k] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("10", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,10", FilesPerLevel(0)); + + // file [0 => 100), [200 => 300), ... [800, 900) + for (auto i = 0; i < 10; i+=2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,0,10", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ("0,5,10", FilesPerLevel(0)); + + // Delete files in range [0, 299] (inclusive) + { + auto begin_str1 = Key(0), end_str1 = Key(100); + auto begin_str2 = Key(100), end_str2 = Key(200); + auto begin_str3 = Key(200), end_str3 = Key(299); + Slice begin1(begin_str1), end1(end_str1); + Slice begin2(begin_str2), end2(end_str2); + Slice begin3(begin_str3), end3(end_str3); + std::vector ranges; + ranges.push_back(RangePtr(&begin1, &end1)); + ranges.push_back(RangePtr(&begin2, &end2)); + ranges.push_back(RangePtr(&begin3, &end3)); + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + ranges.data(), ranges.size())); + ASSERT_EQ("0,3,7", FilesPerLevel(0)); + + // Keys [0, 300) should not exist. + for (auto i = 0; i < 300; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 300; i < 1000; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + } + + // Delete files in range [600, 999) (exclusive) + { + auto begin_str1 = Key(600), end_str1 = Key(800); + auto begin_str2 = Key(700), end_str2 = Key(900); + auto begin_str3 = Key(800), end_str3 = Key(999); + Slice begin1(begin_str1), end1(end_str1); + Slice begin2(begin_str2), end2(end_str2); + Slice begin3(begin_str3), end3(end_str3); + std::vector ranges; + ranges.push_back(RangePtr(&begin1, &end1)); + ranges.push_back(RangePtr(&begin2, &end2)); + ranges.push_back(RangePtr(&begin3, &end3)); + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + ranges.data(), ranges.size(), false)); + ASSERT_EQ("0,1,4", FilesPerLevel(0)); + + // Keys [600, 900) should not exist. + for (auto i = 600; i < 900; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 300; i < 600; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 900; i < 1000; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + } + + // Delete all files. + { + RangePtr range; + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1)); + ASSERT_EQ("", FilesPerLevel(0)); + + for (auto i = 0; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + } +} + +TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { + // regression test for #2833: groups of files whose user-keys overlap at the + // endpoints could be split by `DeleteFilesInRange`. This caused old data to + // reappear, either because a new version of the key was removed, or a range + // deletion was partially dropped. 
It could also cause non-overlapping + // invariant to be violated if the files dropped by DeleteFilesInRange were + // a subset of files that a range deletion spans. + const int kNumL0Files = 2; + const int kValSize = 8 << 10; // 8KB + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + // The snapshot prevents key 1 from having its old version dropped. The low + // `target_file_size_base` ensures two keys will be in each output file. + const Snapshot* snapshot = nullptr; + Random rnd(301); + // The value indicates which flush the key belonged to, which is enough + // for us to determine the keys' relative ages. After L0 flushes finish, + // files look like: + // + // File 0: 0 -> vals[0], 1 -> vals[0] + // File 1: 1 -> vals[1], 2 -> vals[1] + // + // Then L0->L1 compaction happens, which outputs keys as follows: + // + // File 0: 0 -> vals[0], 1 -> vals[1] + // File 1: 1 -> vals[0], 2 -> vals[1] + // + // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that + // would cause `1 -> vals[0]` (an older key) to reappear. + std::string vals[kNumL0Files]; + for (int i = 0; i < kNumL0Files; ++i) { + vals[i] = RandomString(&rnd, kValSize); + Put(Key(i), vals[i]); + Put(Key(i + 1), vals[i]); + Flush(); + if (i == 0) { + snapshot = db_->GetSnapshot(); + } + } + dbfull()->TEST_WaitForCompact(); + + // Verify `DeleteFilesInRange` can't drop only file 0 which would cause + // "1 -> vals[0]" to reappear. + std::string begin_str = Key(0), end_str = Key(1); + Slice begin = begin_str, end = end_str; + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_EQ(vals[1], Get(Key(1))); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::vector values; + // File with keys [ 0 => 99 ] + for (int i = 0; i < 100; i++) { + values.push_back(RandomString(&rnd, value_size)); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1", FilesPerLevel(0)); + // Compaction will do L0=>L1 (trivial move) then move L1 files to L3 + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 3; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // File with keys [ 100 => 199 ] + for (int i = 100; i < 200; i++) { + values.push_back(RandomString(&rnd, value_size)); + ASSERT_OK(Put(Key(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1,0,0,1", FilesPerLevel(0)); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + // Compaction 
will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves) + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,2", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 4); + ASSERT_EQ(non_trivial_move, 0); + + for (int i = 0; i < 200; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + // options = CurrentOptions(options); + + std::vector filenames; + env_->GetChildren(options.db_paths[1].path, &filenames); + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + } + env_->DeleteDir(options.db_paths[1].path); + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are not going to second path. + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to fill up first path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 1) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 2) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,2", FilesPerLevel(0)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 3) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,3", FilesPerLevel(0)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 5) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,5", FilesPerLevel(0)); + ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 6) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,6", FilesPerLevel(0)); + ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 7) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,7", 
FilesPerLevel(0)); + ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); + ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + // options = CurrentOptions(options); + + std::vector<std::string> filenames; + env_->GetChildren(options.db_paths[1].path, &filenames); + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + } + env_->DeleteDir(options.db_paths[1].path); + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // Always gets compacted into 1 Level1 file, + // 0/1 Level 0 file + for (int num = 0; num < 3; num++) { + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0,
GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + + std::vector<Options> option_vector; + option_vector.emplace_back(options); + ColumnFamilyOptions cf_opt1(options), cf_opt2(options); + // Configure CF1 specific paths. + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt1); + CreateColumnFamilies({"one"}, option_vector[1]); + + // Configure CF2 specific paths.
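+ // (Note: cf_paths, when non-empty, takes the place of db_paths for that + // column family, so "one" and "two" keep their SST files under their own + // directories while the default column family continues to use db_paths.)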
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt2); + CreateColumnFamilies({"two"}, option_vector[2]); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + Random rnd(301); + int key_idx = 0; + int key_idx1 = 0; + int key_idx2 = 0; + + auto generate_file = [&]() { + GenerateNewFile(0, &rnd, &key_idx); + GenerateNewFile(1, &rnd, &key_idx1); + GenerateNewFile(2, &rnd, &key_idx2); + }; + + auto check_sstfilecount = [&](int path_id, int expected) { + ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path)); + }; + + auto check_filesperlevel = [&](const std::string& expected) { + ASSERT_EQ(expected, FilesPerLevel(0)); + ASSERT_EQ(expected, FilesPerLevel(1)); + ASSERT_EQ(expected, FilesPerLevel(2)); + }; + + auto check_getvalues = [&]() { + for (int i = 0; i < key_idx; i++) { + auto v = Get(0, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx1; i++) { + auto v = Get(1, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx2; i++) { + auto v = Get(2, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + }; + + // Check that the default column family uses db_paths + // and column family "one" uses cf_paths. + + // First three 110KB files are not going to second path. + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + generate_file(); + } + + // Another 110KB triggers a compaction to 400K file to fill up first path + generate_file(); + check_sstfilecount(1, 3); + + // (1, 4) + generate_file(); + check_filesperlevel("1,4"); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + // (1, 4, 1) + generate_file(); + check_filesperlevel("1,4,1"); + check_sstfilecount(2, 1); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + // (1, 4, 2) + generate_file(); + check_filesperlevel("1,4,2"); + check_sstfilecount(2, 2); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + check_getvalues(); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + check_getvalues(); + + Destroy(options, true); +} + +TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { + Random rnd(301); + int max_key_level_insert = 200; + int max_key_universal_insert = 600; + + // Stage 1: generate a db with level compaction + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 4; + options.level0_file_num_compaction_trigger = 3; + options.max_bytes_for_level_base = 500 << 10; // 500KB + options.max_bytes_for_level_multiplier = 1; + options.target_file_size_base = 200 << 10; // 200KB + options.target_file_size_multiplier = 1; + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + for (int i = 0; i <= max_key_level_insert; i++) { + // each value is 10K + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + } + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(TotalTableFiles(1, 4), 1); + int non_level0_num_files = 0; + for (int i = 1; i < options.num_levels; i++) {
+ non_level0_num_files += NumTableFilesAtLevel(i, 1); + } + ASSERT_GT(non_level0_num_files, 0); + + // Stage 2: reopen with universal compaction - should fail + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + options = CurrentOptions(options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Stage 3: compact into a single file and move the file to level 0 + options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = INT_MAX; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = INT_MAX; + options.max_bytes_for_level_multiplier = 1; + options.num_levels = 4; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 0; + // Cannot use kForceOptimized here because the compaction is expected + // to generate one output file + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); + + // Only 1 file in L0 + ASSERT_EQ("1", FilesPerLevel(1)); + + // Stage 4: re-open in universal compaction style and do some db operations + options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 4; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 3; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + options.num_levels = 1; + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + } + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + for (int i = 1; i < options.num_levels; i++) { + ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); + } + + // Verify keys inserted in both level compaction style and universal + // compaction style + std::string keys_in_db; + Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + keys_in_db.append(iter->key().ToString()); + keys_in_db.push_back(','); + } + delete iter; + + std::string expected_keys; + for (int i = 0; i <= max_key_universal_insert; i++) { + expected_keys.append(Key(i)); + expected_keys.push_back(','); + } + + ASSERT_EQ(keys_in_db, expected_keys); +} + +TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "b", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "b")); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Delete(1, "a")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "v")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("(a->v)", Contents(1)); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(a->v)", Contents(1)); + }
while (ChangeCompactOptions()); +} + +TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + Delete(1, "e"); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + Put(1, "c", "cv"); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + Put(1, "", ""); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + Put(1, "d", "dv"); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + Put(1, "", ""); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + Delete(1, "d"); + Delete(1, "b"); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("(->)(c->cv)", Contents(1)); + env_->SleepForMicroseconds(1000000); // Wait for compaction to finish + ASSERT_EQ("(->)(c->cv)", Contents(1)); + } while (ChangeCompactOptions()); +} + +TEST_F(DBCompactionTest, ManualAutoRace) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"}, + {"DBImpl::RunManualCompaction:WaitScheduled", + "BackgroundCallCompaction:0"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Put(1, "foo", ""); + Put(1, "bar", ""); + Flush(1); + Put(1, "foo", ""); + Put(1, "bar", ""); + // Generate four files in CF 0, which should trigger an auto compaction + Put("foo", ""); + Put("bar", ""); + Flush(); + Put("foo", ""); + Put("bar", ""); + Flush(); + Put("foo", ""); + Put("bar", ""); + Flush(); + Put("foo", ""); + Put("bar", ""); + Flush(); + + // The auto compaction is scheduled but waits until here + TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); + // The auto compaction will wait until the manual compaction is registered + // before processing so that it will be cancelled. + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_EQ("0,1", FilesPerLevel(1)); + + // Eventually the cancelled compaction will be rescheduled and executed.
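+ // (TEST_WaitForCompact() below blocks until the background compaction queue + // drains, so the following assertion observes the rescheduled compaction's + // result rather than racing with it.)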
+ dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionTestWithParam, ManualCompaction) { + Options options = CurrentOptions(); + options.max_subcompactions = max_subcompactions_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("1,1,2", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel(1)); + + // Compact all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + + uint64_t prev_block_cache_add = + options.statistics->getTickerCount(BLOCK_CACHE_ADD); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + db_->CompactRange(cro, handles_[1], nullptr, nullptr); + // Verify manual compaction doesn't fill block cache + ASSERT_EQ(prev_block_cache_add, + options.statistics->getTickerCount(BLOCK_CACHE_ADD)); + + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + + if (iter == 0) { + options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + + +TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put(1, "p", "begin")); + ASSERT_OK(Put(1, "q", "end")); + ASSERT_OK(Flush(1)); + } + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p1", "p9", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Populate a different range + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put(1, "c", "begin")); + ASSERT_OK(Put(1, "e", "end")); + ASSERT_OK(Flush(1)); + } + ASSERT_EQ("3,1", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("0,2", FilesPerLevel(1)); + ASSERT_EQ(2, 
GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compact all + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("1,2", FilesPerLevel(1)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); + CompactRangeOptions compact_options; + compact_options.target_path_id = 1; + compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + if (iter == 0) { + DestroyAndReopen(options); + options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_background_flushes = 1; + options.num_levels = 3; + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + +TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v2")); + Compact(1, "a", "z"); + const size_t num_files = CountLiveFiles(); + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(1, "foo", "v2")); + Compact(1, "a", "z"); + } + ASSERT_EQ(CountLiveFiles(), num_files); + } while (ChangeCompactOptions()); +} + +// Check level compaction with compact files +TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.level0_stop_writes_trigger = 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options.max_subcompactions = max_subcompactions_; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForCompact(); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + int output_level = static_cast<int>(cf_meta.levels.size()) - 1; + for (int file_picked = 5; file_picked > 0; --file_picked) { + std::set<std::string> overlapping_file_names; + std::vector<std::string> compaction_input_file_names; + for (int f = 0; f < file_picked; ++f) { + int level = 0; + auto file_meta = PickFileRandomly(cf_meta, &rnd, &level); + compaction_input_file_names.push_back(file_meta->name); + GetOverlappingFileNumbersForLevelCompaction( + cf_meta, options.comparator, level, output_level, + file_meta, &overlapping_file_names); + } + + ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, + output_level)); + + // Make
sure all overlapping files do not exist after compaction + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + VerifyCompactionResult(cf_meta, overlapping_file_names); + } + + // make sure all key-values are still there. + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND"); + } +} + +TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { + Options options; + const int kKeySize = 16; + const int kKvSize = 1000; + const int kKeysPerBuffer = 100; + const int kNumL1Files = 5; + options.create_if_missing = true; + options.write_buffer_size = kKeysPerBuffer * kKvSize; + options.max_write_buffer_number = 2; + options.target_file_size_base = + options.write_buffer_size * + (options.max_write_buffer_number - 1); + options.level0_file_num_compaction_trigger = kNumL1Files; + options.max_bytes_for_level_base = + options.level0_file_num_compaction_trigger * + options.target_file_size_base; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options.max_subcompactions = max_subcompactions_; + + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + // stop the compaction thread until we simulate the file creation failure. + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + options.env = env_; + + DestroyAndReopen(options); + + const int kNumInsertedKeys = + options.level0_file_num_compaction_trigger * + (options.max_write_buffer_number - 1) * + kKeysPerBuffer; + + Random rnd(301); + std::vector<std::string> keys; + std::vector<std::string> values; + for (int k = 0; k < kNumInsertedKeys; ++k) { + keys.emplace_back(RandomString(&rnd, kKeySize)); + values.emplace_back(RandomString(&rnd, kKvSize - kKeySize)); + ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); + dbfull()->TEST_WaitForFlushMemTable(); + } + + dbfull()->TEST_FlushMemTable(true); + // Make sure the number of L0 files can trigger compaction. + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); + + auto previous_num_level0_files = NumTableFilesAtLevel(0); + + // Fail the first file creation. + env_->non_writable_count_ = 1; + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Expect compaction to fail here as one file will fail its + // creation. + ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok()); + + // Verify L0 -> L1 compaction does fail. + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Verify all L0 files are still there. + ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files); + + // All key-values must exist after compaction fails. + for (int k = 0; k < kNumInsertedKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } + + env_->non_writable_count_ = 0; + + // Make sure RocksDB will not get into a corrupted state. + Reopen(options); + + // Verify again after reopen.
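+ // (Reopen() forces recovery from the MANIFEST and WAL; finding every key + // intact afterwards shows the failed compaction installed no partial state.)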
+ for (int k = 0; k < kNumInsertedKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } +} + +TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { + // iter 1 -- delete_obsolete_files_period_micros == 0 + for (int iter = 0; iter < 2; ++iter) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + if (iter == 1) { + options.delete_obsolete_files_period_micros = 0; + } + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute L0->L1 + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + // block compactions + test::SleepingBackgroundTask sleeping_task; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + Reopen(options); + std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions())); + ASSERT_EQ("0,1", FilesPerLevel(0)); + // let compactions go + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + + // this should execute L1->L2 (move) + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + std::vector<LiveFileMetaData> metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto moved_file_name = metadata[0].name; + + // Create two more 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->L2 (merge with previous file) + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // iterator is holding the file + ASSERT_OK(env_->FileExists(dbname_ + moved_file_name)); + + listener->SetExpectedFileName(dbname_ + moved_file_name); + iterator.reset(); + + // this file should have been compacted away + ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name)); + listener->VerifyMatchedCount(1); + } +} + +TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) { + if (!Zlib_Supported()) { + return; + } + Options options = CurrentOptions(); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + // First two levels have no compression, so that a trivial move between + // them will be allowed.
Level 2 has Zlib compression so that a trivial + // move to level 3 will not be allowed + options.compression_per_level = {kNoCompression, kNoCompression, + kZlibCompression}; + int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Compaction::InputCompressionMatchesOutput:Matches", + [&](void* /*arg*/) { matches++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Compaction::InputCompressionMatchesOutput:DidntMatch", + [&](void* /*arg*/) { didnt_match++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are going to level 0 + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to fill up level 0 + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(4, GetSstFileCount(dbname_)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4", FilesPerLevel(0)); + + // (1, 4, 1) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,1", FilesPerLevel(0)); + + // (1, 4, 2) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,2", FilesPerLevel(0)); + + // (1, 4, 3) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,3", FilesPerLevel(0)); + + // (1, 4, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); + + // (1, 4, 5) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,5", FilesPerLevel(0)); + + // (1, 4, 6) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,6", FilesPerLevel(0)); + + // (1, 4, 7) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,7", FilesPerLevel(0)); + + // (1, 4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); + + ASSERT_EQ(matches, 12); + // Currently, the test relies on the number of calls to + // InputCompressionMatchesOutput() per compaction. 
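+ // (Reading of the assertions below: didnt_match is expected to equal the 8 + // non-trivial compactions into the Zlib-compressed level times 2 checks of + // the input/output compression match per compaction, hence the constant.)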
+ const int kCallsToInputCompressionMatch = 2; + ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch); + ASSERT_EQ(trivial_move, 12); + ASSERT_EQ(non_trivial, 8); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) { + Options options = CurrentOptions(); + options.max_background_compactions = 5; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit); + + options.max_background_compactions = 3; + options.soft_pending_compaction_bytes_limit = 200; + options.hard_pending_compaction_bytes_limit = 150; + DestroyAndReopen(options); + ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit); +} + +// This tests for a bug that could cause two level0 compactions to run +// concurrently +// TODO(aekmekji): Make sure that the reason this fails when run with +// max_subcompactions > 1 is not a correctness issue but just inherent to +// running parallel L0-L1 compactions +TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 98 << 10; + options.max_write_buffer_number = 2; + options.max_background_compactions = 2; + + DestroyAndReopen(options); + + // fill up the DB + Random rnd(301); + for (int num = 0; num < 10; num++) { + GenerateNewRandomFile(&rnd); + } + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"CompactionJob::Run():Start", + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"}, + {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2", + "CompactionJob::Run():End"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // trigger L0 compaction + for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; + num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT( + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"); + + GenerateNewRandomFile(&rnd, /* nowait */ true); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); + for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; + num++) { + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT( + "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); + dbfull()->TEST_WaitForCompact(); +} + +static std::string ShortKey(int i) { + assert(i < 10000); + char buf[100]; + snprintf(buf, sizeof(buf), "key%04d", i); + return std::string(buf); +} + +TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { + int32_t trivial_move = 0; +
int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // The key size is guaranteed to be <= 8 + class ShortKeyComparator : public Comparator { + int Compare(const ROCKSDB_NAMESPACE::Slice& a, + const ROCKSDB_NAMESPACE::Slice& b) const override { + assert(a.size() <= 8); + assert(b.size() <= 8); + return BytewiseComparator()->Compare(a, b); + } + const char* Name() const override { return "ShortKeyComparator"; } + void FindShortestSeparator( + std::string* start, + const ROCKSDB_NAMESPACE::Slice& limit) const override { + return BytewiseComparator()->FindShortestSeparator(start, limit); + } + void FindShortSuccessor(std::string* key) const override { + return BytewiseComparator()->FindShortSuccessor(key); + } + } short_key_cmp; + Options options = CurrentOptions(); + options.target_file_size_base = 100000000; + options.write_buffer_size = 100000000; + options.max_subcompactions = max_subcompactions_; + options.comparator = &short_key_cmp; + DestroyAndReopen(options); + + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::vector<std::string> values; + // File with keys [ 0 => 99 ] + for (int i = 0; i < 100; i++) { + values.push_back(RandomString(&rnd, value_size)); + ASSERT_OK(Put(ShortKey(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1", FilesPerLevel(0)); + // Compaction will do L0=>L1 (trivial move) then move L1 files to L3 + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 3; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + // File with keys [ 100 => 199 ] + for (int i = 100; i < 200; i++) { + values.push_back(RandomString(&rnd, value_size)); + ASSERT_OK(Put(ShortKey(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1,0,0,1", FilesPerLevel(0)); + // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves) + // then compact the bottommost level L3=>L3 (non-trivial move) + compact_options = CompactRangeOptions(); + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 4); + ASSERT_EQ(non_trivial_move, 1); + + // File with keys [ 200 => 299 ] + for (int i = 200; i < 300; i++) { + values.push_back(RandomString(&rnd, value_size)); + ASSERT_OK(Put(ShortKey(i), values[i])); + } + ASSERT_OK(Flush()); + + ASSERT_EQ("1,0,0,1", FilesPerLevel(0)); + trivial_move = 0; + non_trivial_move = 0; + compact_options = CompactRangeOptions(); + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kSkip; + // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves) + // and will skip bottommost level compaction + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,0,2", FilesPerLevel(0)); + ASSERT_EQ(trivial_move, 3); + ASSERT_EQ(non_trivial_move, 0); + + for (int i = 0; i < 300; i++) { + ASSERT_EQ(Get(ShortKey(i)), values[i]); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +
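+// A rough usage sketch (not from this test file) of the manual-compaction +// knobs exercised above; CompactRangeOptions and BottommostLevelCompaction are +// declared in include/rocksdb/options.h, and `db` stands for any open DB*: +// +//   CompactRangeOptions cro; +//   cro.change_level = true;  // place the compaction output on target_level +//   cro.target_level = 3; +//   Status s = db->CompactRange(cro, nullptr, nullptr); +// +//   cro = CompactRangeOptions(); +//   cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip; +//   s = db->CompactRange(cro, nullptr, nullptr);  // trivial moves only; +//                                                 // bottommost files are kept +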
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) { + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::string value(RandomString(&rnd, kValueSize)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompactionBySize:0", + "CompactionJob::Run():Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // index: 0 1 2 3 4 5 6 7 8 9 + // size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB + // score: 1.5 1.3 1.5 2.0 inf + // + // Files 0-4 will be included in an L0->L1 compaction. + // + // L0->L0 will be triggered since the sync points guarantee compaction to base + // level is still blocked when files 5-9 trigger another compaction. + // + // Files 6-9 are the longest span of available files for which + // work-per-deleted-file decreases (see "score" row above). + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(0), "")); // prevents trivial move + if (i == 5) { + ASSERT_OK(Put(Key(i + 1), value + value)); + } else { + ASSERT_OK(Put(Key(i + 1), value)); + } + ASSERT_OK(Flush()); + } + dbfull()->TEST_WaitForCompact(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + std::vector<std::vector<FileMetaData>> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1 + // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0) + ASSERT_EQ(2, level_to_files[0].size()); + ASSERT_GT(level_to_files[1].size(), 0); + for (int i = 0; i < 2; ++i) { + ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21); + } +} + +TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { + // regression test for issue #2722: L0->L0 compaction can resurrect deleted + // keys from older L0 files if L1+ files' key-ranges do not include the key. + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::string value(RandomString(&rnd, kValueSize)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompactionBySize:0", + "CompactionJob::Run():Start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // index: 0 1 2 3 4 5 6 7 8 9 + // size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB + // score: 1.25 1.33 1.5 2.0 inf + // + // Files 0-4 will be included in an L0->L1 compaction. + // + // L0->L0 will be triggered since the sync points guarantee compaction to base + // level is still blocked when files 5-9 trigger another compaction. All files + // 5-9 are included in the L0->L0 due to work-per-deleted file decreasing. + // + // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the + // L0->L0 preserves the deletion such that the key remains deleted. + for (int i = 0; i < 10; ++i) { + // key 0 serves both to prevent trivial move and as the key we want to + // verify is not resurrected by L0->L0 compaction.
+ if (i < 5) { + ASSERT_OK(Put(Key(0), "")); + } else { + ASSERT_OK(Delete(Key(0))); + } + ASSERT_OK(Put(Key(i + 1), value)); + ASSERT_OK(Flush()); + } + dbfull()->TEST_WaitForCompact(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + std::vector<std::vector<FileMetaData>> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1 + // L0 has a single output file from L0->L0 + ASSERT_EQ(1, level_to_files[0].size()); + ASSERT_GT(level_to_files[1].size(), 0); + ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22); + + ReadOptions roptions; + std::string result; + ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound()); +} + +TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) { + const int kNumFilesTrigger = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + for (bool use_universal_compaction : {false, true}) { + Options options = CurrentOptions(); + if (use_universal_compaction) { + options.compaction_style = kCompactionStyleUniversal; + } else { + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + } + options.num_levels = 4; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + int num_bottom_pri_compactions = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkBottomCompaction", + [&](void* /*arg*/) { ++num_bottom_pri_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(1, num_bottom_pri_compactions); + + // Verify that size amplification did occur + ASSERT_EQ(NumSortedRuns(), 1); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + +TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { + // Deletions can be dropped when compacted to non-last level if they fall + // outside the lower-level files' key-ranges. + const int kNumL0Files = 4; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + + // Put keys 1 and 3 in separate L1, L2 files. + // So keys 0, 2, and 4+ fall outside these levels' key-ranges. + for (int level = 2; level >= 1; --level) { + for (int i = 0; i < 2; ++i) { + Put(Key(2 * i + 1), "val"); + Flush(); + } + MoveFilesToLevel(level); + ASSERT_EQ(2, NumTableFilesAtLevel(level)); + } + + // Delete keys in range [1, 4]. These L0 files will be compacted with L1: + // - Tombstones for keys 2 and 4 can be dropped early. + // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
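+ // (General rule: a tombstone can only be elided at a non-bottommost output + // level when no lower level has files overlapping its key and no older + // snapshot still needs it; otherwise it must stay to shadow older values.)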
+ for (int i = 0; i < kNumL0Files; ++i) { + Put(Key(0), "val"); // sentinel to prevent trivial move + Delete(Key(i + 1)); + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + for (int i = 0; i < kNumL0Files; ++i) { + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound()); + } + ASSERT_EQ(2, options.statistics->getTickerCount( + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE)); + ASSERT_EQ(2, + options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE)); +} + +TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) { + // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/ + // CompactFiles() had a bug where it failed to pick a compaction when an L0 + // compaction existed, but marked it as scheduled anyway. It'd never be + // unmarked as scheduled, so future compactions or DB close could hang. + const int kNumL0Files = 5; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files - 1; + options.max_background_compactions = 2; + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::CompactFilesPendingL0Bug:Picked"}, + {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + auto schedule_multi_compaction_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // Files 0-3 will be included in an L0->L1 compaction. + // + // File 4 will be included in a call to CompactFiles() while the first + // compaction is running. + for (int i = 0; i < kNumL0Files - 1; ++i) { + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Put(Key(i + 1), "val")); + ASSERT_OK(Flush()); + } + TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked"); + // file 4 flushed after 0-3 picked + ASSERT_OK(Put(Key(kNumL0Files), "val")); + ASSERT_OK(Flush()); + + // Previously DB close would hang forever as this situation caused the + // scheduled compactions count to never decrement to zero. + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size()); + std::vector<std::string> input_filenames; + input_filenames.push_back(cf_meta.levels[0].files.front().name); + ASSERT_OK(dbfull() + ->CompactFiles(CompactionOptions(), input_filenames, + 0 /* output_level */)); + TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { + // Regression test for bug of not pulling in L0 files that overlap the user- + // specified input files in time- and key-ranges. + Put(Key(0), "old_val"); + Flush(); + Put(Key(0), "new_val"); + Flush(); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_GE(cf_meta.levels.size(), 2); + ASSERT_EQ(2, cf_meta.levels[0].files.size()); + + // Compacting {new L0 file, L1 file} should pull in the old L0 file since it + // overlaps in key-range and time-range.
+ std::vector<std::string> input_filenames; + input_filenames.push_back(cf_meta.levels[0].files.front().name); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames, + 1 /* output_level */)); + ASSERT_EQ("new_val", Get(Key(0))); +} + +TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { + // Bottom-level files may contain deletions due to snapshots protecting the + // deleted keys. Once the snapshot is released, we should see files with many + // such deletions undergo single-file compactions. + const int kNumKeysPerFile = 1024; + const int kNumLevelFiles = 4; + const int kValueSize = 128; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = kNumLevelFiles; + // inflate it a bit to account for key/metadata overhead + options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100; + CreateAndReopenWithCF({"one"}, options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + if (i == kNumLevelFiles - 1) { + snapshot = db_->GetSnapshot(); + // delete every other key after grabbing a snapshot, so these deletions + // and the keys they cover can't be dropped until after the snapshot is + // released. + for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) { + ASSERT_OK(Delete(Key(j))); + } + } + Flush(); + if (i < kNumLevelFiles - 1) { + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); + + std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata; + db_->GetLiveFilesMetaData(&pre_release_metadata); + // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST + // files does not need to be preserved in case of a future snapshot. + ASSERT_OK(Put(Key(0), "val")); + ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); + // release snapshot and wait for compactions to finish. Single-file + // compactions should be triggered, which reduce the size of each bottom-level + // file without changing file count. + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + ASSERT_TRUE(compaction->compaction_reason() == + CompactionReason::kBottommostFiles); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->TEST_WaitForCompact(); + db_->GetLiveFilesMetaData(&post_release_metadata); + ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); + + for (size_t i = 0; i < pre_release_metadata.size(); ++i) { + const auto& pre_file = pre_release_metadata[i]; + const auto& post_file = post_release_metadata[i]; + ASSERT_EQ(1, pre_file.level); + ASSERT_EQ(1, post_file.level); + // each file is smaller than it was before as it was rewritten without + // deletion markers/deleted keys.
+ ASSERT_LT(post_file.size, pre_file.size); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 1024; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + options.max_open_files = -1; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + + // Delete previously written keys. + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,0,0,2", FilesPerLevel()); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + // Just do a simple write + flush so that the TTL-expired files get + // compacted. + ASSERT_OK(Put("a", "1")); + Flush(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->TEST_WaitForCompact(); + // All non-L0 files are deleted, as they contained only deleted data. + ASSERT_EQ("1", FilesPerLevel()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Test dynamically changing ttl. + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + + // Delete previously written keys. + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,0,0,2", FilesPerLevel()); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + // Move time forward by 12 hours, and make sure that compaction still doesn't + // trigger as ttl is set to 24 hours. + env_->addon_time_.fetch_add(12 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,2,0,2", FilesPerLevel()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Dynamically change ttl to 10 hours. + // This should trigger a ttl compaction, as 12 hours have already passed.
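+ // (SetOptions() takes ttl in seconds: 36000 s == 10 h, which is already + // exceeded by the 12 h added above.)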
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}})); + dbfull()->TEST_WaitForCompact(); + // All non-L0 files are deleted, as they contained only deleted data. + ASSERT_EQ("1", FilesPerLevel()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { + const int kValueSize = 100; + + for (bool if_restart : {false, true}) { + for (bool if_open_all_files : {false, true}) { + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + if (if_open_all_files) { + options.max_open_files = -1; + } else { + options.max_open_files = 20; + } + // RocksDB sanitizes max_open_files to at least 20. Modify it back. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast<int*>(arg); + *max_open_files = 2; + }); + // In the case where all files are opened and a DB restart is done, force + // the oldest ancestor time in the manifest file to be 0 to simulate + // reading from an old version. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_field = static_cast<std::string*>(arg); + *encoded_field = ""; + PutVarint64(encoded_field, 0); + } + }); + + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + int ttl_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kTtl) { + ttl_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Add two L6 files with key ranges: [1 .. 100], [101 .. 200]. + Random rnd(301); + for (int i = 1; i <= 100; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + // Get the first file's creation time. This will be the oldest file in the + // DB. Compactions involving this file's descendants should keep getting + // this time. + std::vector<std::vector<FileMetaData>> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time; + // Add 1 hour and do another flush. + env_->addon_time_.fetch_add(1 * 60 * 60); + for (int i = 101; i <= 200; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + MoveFilesToLevel(6); + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); + + env_->addon_time_.fetch_add(1 * 60 * 60); + // Add two L4 files with key ranges: [1 .. 50], [51 .. 150]. + for (int i = 1; i <= 50; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + env_->addon_time_.fetch_add(1 * 60 * 60); + for (int i = 51; i <= 150; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + MoveFilesToLevel(4); + ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel()); + + env_->addon_time_.fetch_add(1 * 60 * 60); + // Add one L1 file with key range: [26, 75]. + for (int i = 26; i <= 75; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(1); + ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel()); + + // LSM tree: + // L1: [26 ..
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.compression = kNoCompression;
+      options.ttl = 24 * 60 * 60;  // 24 hours
+      if (if_open_all_files) {
+        options.max_open_files = -1;
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = 2;
+          });
+      // In the case where all files are opened and the DB is restarted,
+      // force the oldest ancester time in the manifest file to be 0 to
+      // simulate reading from an old version.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+            if (if_restart && if_open_all_files) {
+              std::string* encoded_fieled = static_cast<std::string*>(arg);
+              *encoded_fieled = "";
+              PutVarint64(encoded_fieled, 0);
+            }
+          });
+
+      env_->time_elapse_only_sleep_ = false;
+      options.env = env_;
+
+      env_->addon_time_.store(0);
+      DestroyAndReopen(options);
+
+      int ttl_compactions = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+            Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+            auto compaction_reason = compaction->compaction_reason();
+            if (compaction_reason == CompactionReason::kTtl) {
+              ttl_compactions++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+      Random rnd(301);
+      for (int i = 1; i <= 100; ++i) {
+        ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+      }
+      Flush();
+      // Get the first file's creation time. This will be the oldest file in
+      // the DB. Compactions involving this file's descendants should keep
+      // getting this time.
+      std::vector<std::vector<FileMetaData>> level_to_files;
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+      // Add 1 hour and do another flush.
+      env_->addon_time_.fetch_add(1 * 60 * 60);
+      for (int i = 101; i <= 200; ++i) {
+        ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+      }
+      Flush();
+      MoveFilesToLevel(6);
+      ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+      env_->addon_time_.fetch_add(1 * 60 * 60);
+      // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+      for (int i = 1; i <= 50; ++i) {
+        ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+      }
+      Flush();
+      env_->addon_time_.fetch_add(1 * 60 * 60);
+      for (int i = 51; i <= 150; ++i) {
+        ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+      }
+      Flush();
+      MoveFilesToLevel(4);
+      ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+      env_->addon_time_.fetch_add(1 * 60 * 60);
+      // Add one L1 file with key range: [26, 75].
+      for (int i = 26; i <= 75; ++i) {
+        ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize)));
+      }
+      Flush();
+      dbfull()->TEST_WaitForCompact();
+      MoveFilesToLevel(1);
+      ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+      // LSM tree:
+      // L1:         [26 .. 75]
+      // L4:     [1 .. 50][51 ..... 150]
+      // L6:     [1 ........ 100][101 .... 200]
+      //
+      // On TTL expiry, TTL compaction should be initiated on the L1 file, and
+      // the compactions should keep going on until the key range hits the
+      // bottom level. In other words: the compaction on this data range
+      // "cascades" until reaching the bottom level.
+      //
+      // Order of events on TTL expiry:
+      // 1. The L1 file falls to L3 via 2 trivial moves which are initiated
+      //    by the ttl compaction.
+      // 2. A TTL compaction happens between the L3 and L4 files. Output file
+      //    in L4.
+      // 3. The new output file from L4 falls to L5 via 1 trivial move
+      //    initiated by the ttl compaction.
+      // 4. A TTL compaction happens between the L5 and L6 files. Output
+      //    in L6.
+
+      // Add 25 hours and do a write.
+      env_->addon_time_.fetch_add(25 * 60 * 60);
+
+      ASSERT_OK(Put(Key(1), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        Flush();
+      }
+      dbfull()->TEST_WaitForCompact();
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_EQ(5, ttl_compactions);
+
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+      env_->addon_time_.fetch_add(25 * 60 * 60);
+      ASSERT_OK(Put(Key(2), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        Flush();
+      }
+      dbfull()->TEST_WaitForCompact();
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_GE(ttl_compactions, 6);
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.periodic_compaction_seconds = 48 * 60 * 60;  // 2 days
+      if (if_open_all_files) {
+        options.max_open_files = -1;  // needed for ttl compaction
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = 0;
+          });
+      // In the case where all files are opened and the DB is restarted,
+      // force the file creation time in the manifest file to be 0 to
+      // simulate reading from an old version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_fieled = static_cast(arg); + *encoded_fieled = ""; + PutVarint64(encoded_fieled, 0); + } + }); + + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + int periodic_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), + RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Add 50 hours and do a write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + MoveFilesToLevel(1); + ASSERT_EQ("0,3", FilesPerLevel()); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("b", "2")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,3", FilesPerLevel()); + // The three old files now go through the periodic compaction process. 2 + // + 3. + ASSERT_EQ(5, periodic_compactions); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("c", "3")); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,3", FilesPerLevel()); + // The four old files now go through the periodic compaction process. 5 + // + 4. + ASSERT_EQ(9, periodic_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + } +} + +TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { + // This test makes sure that periodic compactions are working with a DB + // where file_creation_time of some files is 0. 
+  // After compactions the new files are created with a valid
+  // file_creation_time.
+
+  const int kNumKeysPerFile = 32;
+  const int kNumFiles = 4;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  env_->time_elapse_only_sleep_ = false;
+  options.env = env_;
+
+  env_->addon_time_.store(0);
+  DestroyAndReopen(options);
+
+  int periodic_compactions = 0;
+  bool set_file_creation_time_to_zero = true;
+  bool set_creation_time_to_zero = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+          periodic_compactions++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+        TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+        if (set_file_creation_time_to_zero) {
+          props->file_creation_time = 0;
+        }
+        if (set_creation_time_to_zero) {
+          props->creation_time = 0;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+    }
+    Flush();
+    // Move the first two files to L2.
+    if (i == 1) {
+      MoveFilesToLevel(2);
+      set_creation_time_to_zero = false;
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ("2,0,2", FilesPerLevel());
+  ASSERT_EQ(0, periodic_compactions);
+
+  Close();
+
+  set_file_creation_time_to_zero = false;
+  // Forward the clock by 2 days.
+  env_->addon_time_.fetch_add(2 * 24 * 60 * 60);
+  options.periodic_compaction_seconds = 1 * 24 * 60 * 60;  // 1 day
+
+  Reopen(options);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,2", FilesPerLevel());
+  // Make sure that all files go through periodic compaction.
+ ASSERT_EQ(kNumFiles, periodic_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Options options = CurrentOptions(); + options.ttl = 10 * 60 * 60; // 10 hours + options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + options.max_open_files = -1; // needed for both periodic and ttl compactions + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + int periodic_compactions = 0; + int ttl_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } else if (compaction_reason == CompactionReason::kTtl) { + ttl_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + MoveFilesToLevel(3); + + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + ASSERT_EQ(0, ttl_compactions); + + // Add some time greater than periodic_compaction_time. + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Files in the bottom level go through periodic compactions. + ASSERT_EQ("1,0,0,2", FilesPerLevel()); + ASSERT_EQ(2, periodic_compactions); + ASSERT_EQ(0, ttl_compactions); + + // Add a little more time than ttl + env_->addon_time_.fetch_add(11 * 60 * 60); + ASSERT_OK(Put("b", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Notice that the previous file in level 1 falls down to the bottom level + // due to ttl compactions, one level at a time. + // And bottom level files don't get picked up for ttl compactions. + ASSERT_EQ("1,0,0,3", FilesPerLevel()); + ASSERT_EQ(2, periodic_compactions); + ASSERT_EQ(3, ttl_compactions); + + // Add some time greater than periodic_compaction_time. + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("c", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Previous L0 file falls one level at a time to bottom level due to ttl. + // And all 4 bottom files go through periodic compactions. 
+ ASSERT_EQ("1,0,0,4", FilesPerLevel()); + ASSERT_EQ(6, periodic_compactions); + ASSERT_EQ(6, ttl_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { + class TestCompactionFilter : public CompactionFilter { + const char* Name() const override { return "TestCompactionFilter"; } + }; + class TestCompactionFilterFactory : public CompactionFilterFactory { + const char* Name() const override { return "TestCompactionFilterFactory"; } + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new TestCompactionFilter()); + } + }; + + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Random rnd(301); + + Options options = CurrentOptions(); + TestCompactionFilter test_compaction_filter; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + env_->addon_time_.store(0); + + enum CompactionFilterType { + kUseCompactionFilter, + kUseCompactionFilterFactory + }; + + for (CompactionFilterType comp_filter_type : + {kUseCompactionFilter, kUseCompactionFilterFactory}) { + // Assert that periodic compactions are not enabled. + ASSERT_EQ(port::kMaxUint64 - 1, options.periodic_compaction_seconds); + + if (comp_filter_type == kUseCompactionFilter) { + options.compaction_filter = &test_compaction_filter; + options.compaction_filter_factory.reset(); + } else if (comp_filter_type == kUseCompactionFilterFactory) { + options.compaction_filter = nullptr; + options.compaction_filter_factory.reset( + new TestCompactionFilterFactory()); + } + DestroyAndReopen(options); + + // periodic_compaction_seconds should be set to the sanitized value when + // a compaction filter or a compaction filter factory is used. + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + int periodic_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Add 31 days and do a write + env_->addon_time_.fetch_add(31 * 24 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual + // compaction only triggers flush after it's sure stall won't be triggered for + // L0 file count going too high. 
+ const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + // i == 0: verifies normal case where stall is avoided by delay + // i == 1: verifies no delay in edge case where stall trigger is same as + // compaction trigger, so stall can't be avoided + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + if (i == 0) { + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + } else { + options.level0_file_num_compaction_trigger = kNumL0FilesLimit; + } + Reopen(options); + + if (i == 0) { + // ensure the auto compaction doesn't finish until manual compaction has + // had a chance to be delayed. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "CompactionJob::Run():End"}}); + } else { + // ensure the auto-compaction doesn't finish until manual compaction has + // continued without delay. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", + "CompactionJob::Run():End"}}); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put(Key(k), RandomString(&rnd, 1024))); + } + Flush(); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + db_->CompactRange(cro, nullptr, nullptr); + }); + + manual_compaction_thread.join(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual + // compaction only triggers flush after it's sure stall won't be triggered for + // immutable memtable count going too high. + const int kNumImmMemTableLimit = 8; + // i == 0: verifies normal case where stall is avoided by delay + // i == 1: verifies no delay in edge case where stall trigger is same as flush + // trigger, so stall can't be avoided + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + // the delay limit is one less than the stop limit. This test focuses on + // avoiding delay limit, but this option sets stop limit, so add one. + options.max_write_buffer_number = kNumImmMemTableLimit + 1; + if (i == 1) { + options.min_write_buffer_number_to_merge = kNumImmMemTableLimit; + } + Reopen(options); + + if (i == 0) { + // ensure the flush doesn't finish until manual compaction has had a + // chance to be delayed. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "FlushJob::WriteLevel0Table"}}); + } else { + // ensure the flush doesn't finish until manual compaction has continued + // without delay. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", + "FlushJob::WriteLevel0Table"}}); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) { + ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); + FlushOptions flush_opts; + flush_opts.wait = false; + flush_opts.allow_write_stall = true; + dbfull()->Flush(flush_opts); + } + + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + db_->CompactRange(cro, nullptr, nullptr); + }); + + manual_compaction_thread.join(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay + // does not hang if CF is dropped or DB is closed + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it + // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to + // simulate what happens during Close as we can't call Close (it + // blocks on the auto-compaction, making a cycle). + for (int i = 0; i < 2; ++i) { + CreateAndReopenWithCF({"one"}, options); + // The calls to close CF/DB wait until the manual compaction stalls. + // The auto-compaction waits until the manual compaction finishes to ensure + // the signal comes from closing CF/DB, not from compaction making progress. 
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+        {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+          "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
+         {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
+          "CompactionJob::Run():End"}});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+      for (int k = 0; k < 2; ++k) {
+        ASSERT_OK(Put(1, Key(k), RandomString(&rnd, 1024)));
+      }
+      Flush(1);
+    }
+    auto manual_compaction_thread = port::Thread([this, i]() {
+      CompactRangeOptions cro;
+      cro.allow_write_stall = false;
+      Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr);
+      if (i == 0) {
+        ASSERT_TRUE(s.IsColumnFamilyDropped());
+      } else {
+        ASSERT_TRUE(s.IsShutdownInProgress());
+      }
+    });
+
+    TEST_SYNC_POINT(
+        "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
+    if (i == 0) {
+      ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+    } else {
+      dbfull()->CancelAllBackgroundWork(false /* wait */);
+    }
+    manual_compaction_thread.join();
+    TEST_SYNC_POINT(
+        "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
+    dbfull()->TEST_WaitForCompact();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
+  // Verify that, when `CompactRangeOptions::allow_write_stall == false`,
+  // CompactRange skips its flush if the delay is long enough that the
+  // memtables existing at the beginning of the call have already been flushed.
+  const int kNumL0FilesTrigger = 4;
+  const int kNumL0FilesLimit = 8;
+  Options options = CurrentOptions();
+  options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+  options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+  Reopen(options);
+
+  Random rnd(301);
+  // The manual flush includes the memtable that was active when CompactRange
+  // began. So it unblocks CompactRange and precludes its flush. Throughout the
+  // test, stall conditions are upheld via high L0 file count.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+        "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
+       {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
+        "DBImpl::FlushMemTable:StallWaitDone"},
+       {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Used for the delayable flushes.
+  FlushOptions flush_opts;
+  flush_opts.allow_write_stall = true;
+  for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+    }
+    dbfull()->Flush(flush_opts);
+  }
+  auto manual_compaction_thread = port::Thread([this]() {
+    CompactRangeOptions cro;
+    cro.allow_write_stall = false;
+    db_->CompactRange(cro, nullptr, nullptr);
+  });
+
+  TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
+  Put(ToString(0), RandomString(&rnd, 1024));
+  dbfull()->Flush(flush_opts);
+  Put(ToString(0), RandomString(&rnd, 1024));
+  TEST_SYNC_POINT(
+      "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
+  manual_compaction_thread.join();
+
+  // If CompactRange's flush was skipped, the final Put above will still be
+  // in the active memtable.
+ std::string num_keys_in_memtable; + db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_keys_in_memtable); + ASSERT_EQ(ToString(1), num_keys_in_memtable); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) { + // Verify memtable only gets flushed if it contains data overlapping the range + // provided to `CompactRange`. Tests all kinds of overlap/non-overlap. + const int kNumEndpointKeys = 5; + std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"}; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + + // One extra iteration for nullptr, which means left side of interval is + // unbounded. + for (int i = 0; i <= kNumEndpointKeys; ++i) { + Slice begin; + Slice* begin_ptr; + if (i == 0) { + begin_ptr = nullptr; + } else { + begin = keys[i - 1]; + begin_ptr = &begin; + } + // Start at `i` so right endpoint comes after left endpoint. One extra + // iteration for nullptr, which means right side of interval is unbounded. + for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) { + Slice end; + Slice* end_ptr; + if (j == kNumEndpointKeys) { + end_ptr = nullptr; + } else { + end = keys[j]; + end_ptr = &end; + } + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Put("d", "val")); + CompactRangeOptions compact_range_opts; + ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr)); + + uint64_t get_prop_tmp, num_memtable_entries = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + if (begin_ptr == nullptr || end_ptr == nullptr || + (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) { + // In this case `CompactRange`'s range overlapped in some way with the + // memtable's range, so flush should've happened. Then "b" and "d" won't + // be in the memtable. + ASSERT_EQ(0, num_memtable_entries); + } else { + ASSERT_EQ(2, num_memtable_entries); + // flush anyways to prepare for next iteration + db_->Flush(FlushOptions()); + } + } + } +} + +TEST_F(DBCompactionTest, CompactionStatsTest) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + CompactionStatsCollector* collector = new CompactionStatsCollector(); + options.listeners.emplace_back(collector); + DestroyAndReopen(options); + + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + Put(std::to_string(j), std::string(1, 'A')); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + dbfull()->TEST_WaitForCompact(); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + + VerifyCompactionStats(*cfd, *collector); +} + +TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) { + // LSM setup: + // L1: [ba bz] + // L2: [a b] [c d] + // L3: [a b] [c d] + // + // Thread 1: Thread 2: + // Begin compacting all L2->L3 + // Compact [ba bz] L1->L3 + // End compacting all L2->L3 + // + // The compaction operation in thread 2 should be disallowed because the range + // overlaps with the compaction in thread 1, which also covers that range in + // L3. 
+ Options options = CurrentOptions(); + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + Reopen(options); + + for (int level = 3; level >= 2; --level) { + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "val")); + ASSERT_OK(Put("d", "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + ASSERT_OK(Put("ba", "val")); + ASSERT_OK(Put("bz", "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:0", + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"}, + {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End", + "CompactFilesImpl:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + auto bg_thread = port::Thread([&]() { + // Thread 1 + std::vector filenames = collector->GetFlushedFiles(); + filenames.pop_back(); + ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames, + 3 /* output_level */)); + }); + + // Thread 2 + TEST_SYNC_POINT( + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"); + std::string filename = collector->GetFlushedFiles().back(); + ASSERT_FALSE( + db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */) + .ok()); + TEST_SYNC_POINT( + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End"); + + bg_thread.join(); +} + +TEST_F(DBCompactionTest, CompactionHasEmptyOutput) { + Options options = CurrentOptions(); + SstStatsCollector* collector = new SstStatsCollector(); + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(collector); + Reopen(options); + + // Make sure the L0 files overlap to prevent trivial move. + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("a")); + ASSERT_OK(Delete("b")); + ASSERT_OK(Flush()); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Expect one file creation to start for each flush, and zero for compaction + // since no keys are written. 
+  ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+  const int kNumKeysPerFile = 10;
+  const int kMaxBackgroundThreads = 64;
+
+  struct CompactionLimiter {
+    std::string name;
+    int limit_tasks;
+    int max_tasks;
+    int tasks;
+    std::shared_ptr<ConcurrentTaskLimiter> limiter;
+  };
+
+  std::vector<CompactionLimiter> limiter_settings;
+  limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+  limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+  limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+  for (auto& ls : limiter_settings) {
+    ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+  }
+
+  std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+      NewConcurrentTaskLimiter("unique_limiter", -1));
+
+  const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5",
+                            "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"};
+  const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
+
+  std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 * 1024;  // 110KB
+  options.arena_block_size = 4096;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 64;
+  options.level0_stop_writes_trigger = 64;
+  options.max_background_jobs = kMaxBackgroundThreads;  // Enough threads
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+  options.max_write_buffer_number = 10;  // Enough memtables
+  DestroyAndReopen(options);
+
+  std::vector<Options> option_vector;
+  option_vector.reserve(cf_count);
+
+  for (unsigned int cf = 0; cf < cf_count; cf++) {
+    ColumnFamilyOptions cf_opt(options);
+    if (cf == 0) {
+      // The "default" CF doesn't use a compaction limiter.
+      cf_opt.compaction_thread_limiter = nullptr;
+    } else if (cf == 1) {
+      // CF "1" uses the bypass compaction limiter.
+      unique_limiter->SetMaxOutstandingTask(-1);
+      cf_opt.compaction_thread_limiter = unique_limiter;
+    } else {
+      // Assign limiter by mod.
+      auto& ls = limiter_settings[cf % 3];
+      cf_opt.compaction_thread_limiter = ls.limiter;
+      cf_to_limiter[cf_names[cf]] = &ls;
+    }
+    option_vector.emplace_back(DBOptions(options), cf_opt);
+  }
+
+  for (unsigned int cf = 1; cf < cf_count; cf++) {
+    CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+  }
+
+  ReopenWithColumnFamilies(
+      std::vector<std::string>(cf_names, cf_names + cf_count), option_vector);
+
+  port::Mutex mutex;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+        const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+        auto iter = cf_to_limiter.find(cf_name);
+        if (iter != cf_to_limiter.end()) {
+          MutexLock l(&mutex);
+          ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+          iter->second->max_tasks =
+              std::max(iter->second->max_tasks, iter->second->limit_tasks);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+        const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+        auto iter = cf_to_limiter.find(cf_name);
+        if (iter != cf_to_limiter.end()) {
+          MutexLock l(&mutex);
+          ASSERT_GE(--iter->second->tasks, 0);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Block all compact threads in thread pool.
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4; + const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks; + env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH); + env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW); + + test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks]; + + // Block all compaction threads in thread pool. + for (size_t i = 0; i < kTotalCompactTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_compact_tasks[i], Env::LOW); + sleeping_compact_tasks[i].WaitUntilSleeping(); + } + + int keyIndex = 0; + + for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + } + + for (unsigned int cf = 0; cf < cf_count; cf++) { + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + } + } + + // Enough L0 files to trigger compaction + for (unsigned int cf = 0; cf < cf_count; cf++) { + ASSERT_EQ(NumTableFilesAtLevel(0, cf), + options.level0_file_num_compaction_trigger); + } + + // Create more files for one column family, which triggers speed up + // condition, all compactions will be scheduled. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(0, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(0, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 0)); + } + + // All CFs are pending compaction + ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW)); + + // Unblock all compaction threads + for (size_t i = 0; i < kTotalCompactTasks; i++) { + sleeping_compact_tasks[i].WakeUp(); + sleeping_compact_tasks[i].WaitUntilDone(); + } + + for (unsigned int cf = 0; cf < cf_count; cf++) { + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Max outstanding compact tasks reached limit + for (auto& ls : limiter_settings) { + ASSERT_EQ(ls.limit_tasks, ls.max_tasks); + ASSERT_EQ(0, ls.limiter->GetOutstandingTask()); + } + + // test manual compaction under a fully throttled limiter + int cf_test = 1; + unique_limiter->SetMaxOutstandingTask(0); + + // flush one more file to cf 1 + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf_test, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf_test, "", "")); + + dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]); + ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); + + Compact(cf_test, Key(0), Key(keyIndex)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); +} + +INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, + ::testing::Values(std::make_tuple(1, true), + std::make_tuple(1, false), + std::make_tuple(4, true), + std::make_tuple(4, false))); + +TEST_P(DBCompactionDirectIOTest, DirectIO) { + Options options = CurrentOptions(); + Destroy(options); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.use_direct_io_for_flush_and_compaction = GetParam(); + options.env = new MockEnv(Env::Default()); + Reopen(options); + bool readahead = false; + SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { + bool* 
+        use_direct_writes = static_cast<bool*>(arg);
+        ASSERT_EQ(*use_direct_writes,
+                  options.use_direct_io_for_flush_and_compaction);
+      });
+  if (options.use_direct_io_for_flush_and_compaction) {
+    SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; });
+  }
+  SyncPoint::GetInstance()->EnableProcessing();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  MakeTables(3, "p", "q", 1);
+  ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  Compact(1, "p1", "p9");
+  ASSERT_EQ(readahead, options.use_direct_reads);
+  ASSERT_EQ("0,0,1", FilesPerLevel(1));
+  Destroy(options);
+  delete options.env;
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
+                        testing::Bool());
+
+class CompactionPriTest : public DBTestBase,
+                          public testing::WithParamInterface<uint32_t> {
+ public:
+  CompactionPriTest() : DBTestBase("/compaction_pri_test") {
+    compaction_pri_ = GetParam();
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t compaction_pri_;
+};
+
+TEST_P(CompactionPriTest, Test) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 16 * 1024;
+  options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
+  options.hard_pending_compaction_bytes_limit = 256 * 1024;
+  options.max_bytes_for_level_base = 64 * 1024;
+  options.max_bytes_for_level_multiplier = 4;
+  options.compression = kNoCompression;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  const int kNKeys = 5000;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
+
+  for (int i = 0; i < kNKeys; i++) {
+    ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 102)));
+  }
+
+  dbfull()->TEST_WaitForCompact();
+  for (int i = 0; i < kNKeys; i++) {
+    ASSERT_NE("NOT_FOUND", Get(Key(i)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    CompactionPriTest, CompactionPriTest,
+    ::testing::Values(CompactionPri::kByCompensatedSize,
+                      CompactionPri::kOldestLargestSeqFirst,
+                      CompactionPri::kOldestSmallestSeqFirst,
+                      CompactionPri::kMinOverlappingRatio));
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+  NoopMergeOperator() {}
+
+  bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                   MergeOperationOutput* merge_out) const override {
+    std::string val("bar");
+    merge_out->new_value = val;
+    return true;
+  }
+
+  const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+  Options opts = CurrentOptions();
+  opts.num_levels = 3;
+  opts.level0_file_num_compaction_trigger = 10;
+  opts.compression = kNoCompression;
+  opts.merge_operator.reset(new NoopMergeOperator());
+  opts.target_file_size_base = 10240;
+  DestroyAndReopen(opts);
+
+  Random rnd(301);
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      Merge("foo", RandomString(&rnd, 1024));
+    }
+    Flush();
+  }
+
+  MoveFilesToLevel(2);
+
+  std::string prop;
+  EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+  uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  dbfull()->CompactRange(cro, nullptr, nullptr);
+}
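+
+// The two knobs used above, in isolation: cap how much data a single
+// compaction may pick up, then force the bottommost level to be rewritten.
+// Illustrative sketch only, not used by any test; `db` stands for any open
+// DB handle and the byte limit is arbitrary.
+inline void BoundedFullCompactionSketch(DB* db) {
+  // max_compaction_bytes is dynamically changeable, like in the test above.
+  db->SetOptions({{"max_compaction_bytes", "1048576"}});
+  CompactRangeOptions cro;
+  // Rewrite bottommost files, skipping those just produced by this same
+  // manual compaction (the "optimized" part).
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  db->CompactRange(cro, nullptr, nullptr);  // whole key space
+}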
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+  // Regression test for a bug where manual compaction hangs forever when the
+  // DB is in read-only mode. Verify it now at least returns, despite failing.
+  const int kNumL0Files = 4;
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.env = mock_env.get();
+  DestroyAndReopen(opts);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // Make sure files are overlapping in key-range to prevent trivial move.
+    Put("key1", RandomString(&rnd, 1024));
+    Put("key2", RandomString(&rnd, 1024));
+    Flush();
+  }
+  ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+  // Enter read-only mode by failing a write.
+  mock_env->SetFilesystemActive(false);
+  // Make sure this is outside `CompactRange`'s range so that it doesn't fail
+  // early trying to flush memtable.
+  ASSERT_NOK(Put("key3", RandomString(&rnd, 1024)));
+
+  // In the bug scenario, the first manual compaction would fail and forget to
+  // unregister itself, causing the second one to hang forever due to conflict
+  // with a non-running compaction.
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = false;
+  Slice begin_key("key1");
+  Slice end_key("key2");
+  ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+  ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+  // Close before mock_env destruct.
+  Close();
+}
+
+// ManualCompactionBottomLevelOptimized tests the bottom level manual
+// compaction optimization to skip recompacting files created by Ln-1 to Ln
+// compaction.
+TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
+  Options opts = CurrentOptions();
+  opts.num_levels = 3;
+  opts.level0_file_num_compaction_trigger = 5;
+  opts.compression = kNoCompression;
+  opts.merge_operator.reset(new NoopMergeOperator());
+  opts.target_file_size_base = 1024;
+  opts.max_bytes_for_level_multiplier = 2;
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  InternalStats* internal_stats_ptr = cfd->internal_stats();
+  ASSERT_NE(internal_stats_ptr, nullptr);
+
+  Random rnd(301);
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+    }
+    Flush();
+  }
+
+  MoveFilesToLevel(2);
+
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("bar" + std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+    }
+    Flush();
+  }
+  const std::vector<InternalStats::CompactionStats>& comp_stats =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  int num = comp_stats[2].num_input_files_in_output_level;
+  ASSERT_EQ(num, 0);
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  dbfull()->CompactRange(cro, nullptr, nullptr);
+
+  const std::vector<InternalStats::CompactionStats>& comp_stats2 =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  num = comp_stats2[2].num_input_files_in_output_level;
+  ASSERT_EQ(num, 0);
+}
+
+TEST_F(DBCompactionTest, CompactionDuringShutdown) {
+  Options opts = CurrentOptions();
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  InternalStats* internal_stats_ptr = cfd->internal_stats();
+  ASSERT_NE(internal_stats_ptr, nullptr);
+
+  Random rnd(301);
+  for (auto i = 0; i < 2; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(Put("foo" +
+                    std::to_string(i * 10 + j), RandomString(&rnd, 1024)));
+    }
+    Flush();
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+      [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_OK(dbfull()->error_handler_.GetBGError());
+}
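+
+// The SyncPoint pattern that tests like the one above rely on, reduced to
+// its skeleton: register a callback on a named point, enable processing,
+// exercise the DB, then disable processing and clear callbacks so later
+// tests are unaffected. Illustrative sketch only;
+// "SomeClass::SomeMethod:Tag" is a placeholder point name, not a real
+// instrumentation point.
+inline void SyncPointPatternSketch() {
+  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
+  sp->SetCallBack("SomeClass::SomeMethod:Tag", [](void* /*arg*/) {
+    // Runs inline whenever the instrumented code reaches the point.
+  });
+  sp->EnableProcessing();
+  // ... run the code under test here ...
+  sp->DisableProcessing();
+  sp->ClearAllCallBacks();
+}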
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+  const int kNumKeysPerFile = 100;
+  // Generate SST files.
+  Options options = CurrentOptions();
+
+  // Generate an external SST file containing a single key, i.e. 99
+  std::string sst_files_dir = dbname_ + "/sst_files/";
+  test::DestroyDir(env_, sst_files_dir);
+  ASSERT_OK(env_->CreateDir(sst_files_dir));
+  SstFileWriter sst_writer(EnvOptions(), options);
+  const std::string sst_file_path = sst_files_dir + "test.sst";
+  ASSERT_OK(sst_writer.Open(sst_file_path));
+  ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
+  ASSERT_OK(sst_writer.Finish());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+       "BackgroundCallCompaction:0"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.level0_file_num_compaction_trigger =
+      options.level0_stop_writes_trigger;
+  options.max_subcompactions = max_subcompactions_;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Generate level0_stop_writes_trigger L0 files to trigger write stop.
+  for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+    for (int j = 0; j != kNumKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(j), RandomString(&rnd, 990)));
+    }
+    if (0 == i) {
+      // When we reach here, the memtables have kNumKeysPerFile keys. Note that
+      // flush is not yet triggered. We need to write an extra key so that the
+      // write path will call PreprocessWrite and cause the previous key-value
+      // pairs to be flushed. After that, there will be the newest key in the
+      // memtable, and a bunch of L0 files. Since there is already one key in
+      // the memtable, then for i = 1, 2, ..., we do not have to write this
+      // extra key to trigger flush.
+      ASSERT_OK(Put("", ""));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
+  }
+  // When we reach this point, there will be level0_stop_writes_trigger L0
+  // files and one extra key (99) in memory, which overlaps with the external
+  // SST file. Write stall triggers, and can be cleared only after compaction
+  // reduces the number of L0 files.
+
+  // Compaction will also be triggered since we have reached the threshold for
+  // auto compaction. Note that compaction may begin after the following file
+  // ingestion thread starts and wait for the ingestion to finish.
+
+  // Thread to ingest a file whose key range overlaps with the current
+  // memtable. Consequently ingestion will trigger a flush. The flush MUST
+  // proceed without waiting for the write stall condition to clear, otherwise
+  // deadlock can happen.
+  port::Thread ingestion_thr([&]() {
+    IngestExternalFileOptions ifo;
+    Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+    ASSERT_OK(s);
+  });
+
+  // More write to trigger write stop
+  ingestion_thr.join();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Close();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistency", [&](void* arg) {
+        auto p =
+            reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // Just swap the two FileMetaData pointers so that we hit an error
+        // in the CheckConsistency function.
+        FileMetaData* temp = *(p->first);
+        *(p->first) = *(p->second);
+        *(p->second) = temp;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int k = 0; k < 2; ++k) {
+    ASSERT_OK(Put("foo", "bar"));
+    Flush();
+  }
+
+  ASSERT_NOK(Put("foo", "bar"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+void IngestOneKeyValue(DBImpl* db, const std::string& key,
+                       const std::string& value, const Options& options) {
+  ExternalSstFileInfo info;
+  std::string f = test::PerThreadDBPath("sst_file" + key);
+  EnvOptions env;
+  ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
+  auto s = writer.Open(f);
+  ASSERT_OK(s);
+  ASSERT_OK(writer.Put(key, value));
+
+  ASSERT_OK(writer.Finish(&info));
+  IngestExternalFileOptions ingest_opt;
+
+  ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
+}
+
+TEST_P(DBCompactionTestWithParam,
+       FlushAfterIntraL0CompactionCheckConsistencyFail) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::atomic<int> pick_intra_l0_count(0);
+  std::string value(RandomString(&rnd, kValueSize));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompactionTestWithParam::FlushAfterIntraL0:1",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FindIntraL0Compaction",
+      [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write keys that overlap so later files cannot be trivially moved.
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));  // prevents trivial move
+  }
+  ASSERT_OK(Flush());
+  Compact("", Key(99));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Flush 5 L0 ssts.
+  for (int i = 0; i < 5; ++i) {
+    ASSERT_OK(Put(Key(i + 1), value));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Put one key so that the smallest log sequence number in this memtable is
+  // less than that of the ssts ingested in the next step.
+  ASSERT_OK(Put(Key(0), "a"));
+
+  ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Ingest 5 L0 ssts. These files will trigger PickIntraL0Compaction.
+  for (int i = 5; i < 10; i++) {
+    IngestOneKeyValue(dbfull(), Key(i), value, options);
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+
+  TEST_SYNC_POINT("DBCompactionTestWithParam::FlushAfterIntraL0:1");
+  // Put one key so that the biggest log sequence number in this memtable is
+  // bigger than that of the ssts ingested above.
+  ASSERT_OK(Put(Key(2), "b"));
+  ASSERT_EQ(10, NumTableFilesAtLevel(0));
+  dbfull()->TEST_WaitForCompact();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_GT(level_to_files[0].size(), 0);
+  ASSERT_GT(pick_intra_l0_count.load(), 0);
+
+  ASSERT_OK(Flush());
+}
+
+TEST_P(DBCompactionTestWithParam,
+       IntraL0CompactionAfterFlushCheckConsistencyFail) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  options.write_buffer_size = 2 << 20;
+  options.max_write_buffer_number = 6;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::string value(RandomString(&rnd, kValueSize));
+  std::string value2(RandomString(&rnd, kValueSize));
+  std::string bigvalue = value + value;
+
+  // Write keys that overlap so later files cannot be trivially moved.
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));  // prevents trivial move
+  }
+  ASSERT_OK(Flush());
+  Compact("", Key(99));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  std::atomic<int> pick_intra_l0_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FindIntraL0Compaction",
+      [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Make 6 L0 ssts.
+  for (int i = 0; i < 6; ++i) {
+    if (i % 2 == 0) {
+      IngestOneKeyValue(dbfull(), Key(i), value, options);
+    } else {
+      ASSERT_OK(Put(Key(i), value));
+      ASSERT_OK(Flush());
+    }
+  }
+
+  ASSERT_EQ(6, NumTableFilesAtLevel(0));
+
+  // Stop the flush job from running.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  test::SleepingBackgroundTask sleeping_tasks;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
+                 Env::Priority::HIGH);
+  sleeping_tasks.WaitUntilSleeping();
+
+  // Put many keys so that the memtable requests a flush.
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_OK(Put(Key(i), bigvalue));
+  }
+
+  ASSERT_EQ(6, NumTableFilesAtLevel(0));
+  // Ingest files to trigger IntraL0Compaction.
+  for (int i = 6; i < 10; ++i) {
+    ASSERT_EQ(i, NumTableFilesAtLevel(0));
+    IngestOneKeyValue(dbfull(), Key(i), value2, options);
+  }
+  ASSERT_EQ(10, NumTableFilesAtLevel(0));
+
+  // Wake up the flush job.
+  sleeping_tasks.WakeUp();
+  sleeping_tasks.WaitUntilDone();
+  TEST_SYNC_POINT("DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1");
+  dbfull()->TEST_WaitForCompact();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  uint64_t error_count = 0;
+  db_->GetIntProperty("rocksdb.background-errors", &error_count);
+  ASSERT_EQ(error_count, 0);
+  ASSERT_GT(pick_intra_l0_count.load(), 0);
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_EQ(bigvalue, Get(Key(i)));
+  }
+  for (int i = 6; i < 10; ++i) {
+    ASSERT_EQ(value2, Get(Key(i)));
+  }
+}
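+
+// The ingestion helper above in its essential form: write a sorted run with
+// SstFileWriter, then hand the finished file to the DB. Illustrative sketch
+// only, not used by any test; the path is a placeholder and error handling
+// is collapsed into chained status checks.
+inline Status IngestSingleFileSketch(DB* db, const Options& options) {
+  const std::string path = "/tmp/ingest_sketch.sst";
+  SstFileWriter writer(EnvOptions(), options);
+  Status s = writer.Open(path);
+  if (s.ok()) s = writer.Put("key", "value");  // keys must be added in order
+  if (s.ok()) s = writer.Finish();
+  if (s.ok()) s = db->IngestExternalFile({path}, IngestExternalFileOptions());
+  return s;
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void) argc;
+  (void) argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc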
b/src/rocksdb/db/db_dynamic_level_test.cc new file mode 100644 index 000000000..c26657701 --- /dev/null +++ b/src/rocksdb/db/db_dynamic_level_test.cc @@ -0,0 +1,505 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Introduction of SyncPoint effectively disabled building and running this test +// in Release build. +// which is a pity, it is a good test +#if !defined(ROCKSDB_LITE) + +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { +class DBTestDynamicLevel : public DBTestBase { + public: + DBTestDynamicLevel() : DBTestBase("/db_dynamic_level_test") {} +}; + +TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) { + if (!Snappy_Supported() || !LZ4_Supported()) { + return; + } + // Use InMemoryEnv, or it would be too slow. + std::unique_ptr env(new MockEnv(env_)); + + const int kNKeys = 1000; + int keys[kNKeys]; + + auto verify_func = [&]() { + for (int i = 0; i < kNKeys; i++) { + ASSERT_NE("NOT_FOUND", Get(Key(i))); + ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i))); + if (i < kNKeys / 10) { + ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i]))); + } else { + ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i]))); + } + } + }; + + Random rnd(301); + for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) { + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + if (ordered_insert == 0) { + std::random_shuffle(std::begin(keys), std::end(keys)); + } + for (int max_background_compactions = 1; max_background_compactions < 4; + max_background_compactions += 2) { + Options options; + options.env = env.get(); + options.create_if_missing = true; + options.write_buffer_size = 2048; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.target_file_size_base = 2048; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.soft_rate_limit = 1.1; + options.max_background_compactions = max_background_compactions; + options.num_levels = 5; + + options.compression_per_level.resize(3); + options.compression_per_level[0] = kNoCompression; + options.compression_per_level[1] = kLZ4Compression; + options.compression_per_level[2] = kSnappyCompression; + options.env = env_; + + DestroyAndReopen(options); + + for (int i = 0; i < kNKeys; i++) { + int key = keys[i]; + ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(key), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102))); + ASSERT_OK(Delete(Key(kNKeys + keys[i / 10]))); + env_->SleepForMicroseconds(5000); + } + + uint64_t int_prop; + ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop)); + ASSERT_EQ(0U, int_prop); + + // Verify DB + for (int j = 0; j < 2; j++) { + verify_func(); + if (j == 0) { + Reopen(options); + } + } + + // Test compact range works + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // All data should be 
in the last level. + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + ASSERT_EQ(5U, cf_meta.levels.size()); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(0U, cf_meta.levels[i].files.size()); + } + ASSERT_GT(cf_meta.levels[4U].files.size(), 0U); + verify_func(); + + Close(); + } + } + + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); +} + +// Test specific cases in dynamic max bytes +TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { + Random rnd(301); + int kMaxKey = 1000000; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.create_if_missing = true; + options.write_buffer_size = 20480; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 9999; + options.level0_stop_writes_trigger = 9999; + options.target_file_size_base = 9102; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 40960; + options.max_bytes_for_level_multiplier = 4; + options.max_background_compactions = 2; + options.num_levels = 5; + options.max_compaction_bytes = 0; // Force not expanding in compactions + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + })); + + uint64_t int_prop; + std::string str_prop; + + // Initial base level is the last level + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(4U, int_prop); + + // Put about 28K to L0 + for (int i = 0; i < 70; i++) { + ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), + RandomString(&rnd, 380))); + } + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "false"}, + })); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(4U, int_prop); + + // Insert extra about 28K to L0. After they are compacted to L4, the base + // level should be changed to L3. + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + })); + for (int i = 0; i < 70; i++) { + ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), + RandomString(&rnd, 380))); + } + + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "false"}, + })); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(3U, int_prop); + ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop)); + ASSERT_EQ("0", str_prop); + ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop)); + ASSERT_EQ("0", str_prop); + + // Write even more data while leaving the base level at L3. + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + })); + // Write about 40K more + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), + RandomString(&rnd, 380))); + } + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "false"}, + })); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(3U, int_prop); + + // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base + // level to 2. + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + })); + // Write about 650K more. 
+ // Each file is about 11KB, with 9KB of data. + for (int i = 0; i < 1300; i++) { + ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), + RandomString(&rnd, 380))); + } + + // Make sure that the compaction starts before the last bit of data is + // flushed, so that the base level isn't raised to L1. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "false"}, + })); + + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(2U, int_prop); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Write more data until the base level changes to L1. There will be + // a manual compaction going on at the same time. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"}, + {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"}, + {"DynamicLevelMaxBytesBase2:compact_range_finish", + "FlushJob::WriteLevel0Table"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread thread([this] { + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish"); + }); + + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1"); + for (int i = 0; i < 2; i++) { + ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), + RandomString(&rnd, 380))); + } + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2"); + + Flush(); + + thread.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(1U, int_prop); +} + +// Test specific cases in dynamic max bytes +TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { + Random rnd(301); + int kMaxKey = 1000000; + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.write_buffer_size = 2048; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 9999; + options.level0_stop_writes_trigger = 9999; + options.target_file_size_base = 2; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.max_background_compactions = 1; + const int kNumLevels = 5; + options.num_levels = kNumLevels; + options.max_compaction_bytes = 1; // Force not expanding in compactions + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + // Compact against empty DB + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + uint64_t int_prop; + std::string str_prop; + + // Initial base level is the last level + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(4U, int_prop); + + // Put about 7K to L0 + for (int i = 0; i < 140; i++) { + ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), + 
RandomString(&rnd, 80))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + if (NumTableFilesAtLevel(0) == 0) { + // Make sure level 0 is not empty + ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), + RandomString(&rnd, 80))); + Flush(); + } + + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(3U, int_prop); + ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop)); + ASSERT_EQ("0", str_prop); + ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop)); + ASSERT_EQ("0", str_prop); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::set<int> output_levels; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::CompactRange:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + output_levels.insert(compaction->output_level()); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(output_levels.size(), 2); + ASSERT_TRUE(output_levels.find(3) != output_levels.end()); + ASSERT_TRUE(output_levels.find(4) != output_levels.end()); + ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop)); + ASSERT_EQ("0", str_prop); + ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop)); + ASSERT_EQ("0", str_prop); + // Base level is still level 3. + ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); + ASSERT_EQ(3U, int_prop); +} + +TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.write_buffer_size = 2048; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.target_file_size_base = 2048; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.soft_rate_limit = 1.1; + options.max_background_compactions = 2; + options.num_levels = 5; + options.max_compaction_bytes = 100000000; + + DestroyAndReopen(options); + + int non_trivial = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + const int total_keys = 3000; + const int random_part_size = 100; + for (int i = 0; i < total_keys; i++) { + std::string value = RandomString(&rnd, random_part_size); + PutFixed32(&value, static_cast<uint32_t>(i)); + ASSERT_OK(Put(Key(i), value)); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_EQ(non_trivial, 0); + + for (int i = 0; i < total_keys; i++) { + std::string value = Get(Key(i)); + ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size), + static_cast<uint32_t>(i)); + } + + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); +} + +TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { + Random rnd(301); + const int kMaxKey = 2000; + + Options options; + options.create_if_missing = true; + options.write_buffer_size = 2048; + options.max_write_buffer_number = 8; + options.level0_file_num_compaction_trigger = 4; + options.level0_slowdown_writes_trigger = 4; + 
options.level0_stop_writes_trigger = 8; + options.target_file_size_base = 2048; + options.level_compaction_dynamic_level_bytes = false; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.soft_rate_limit = 1.1; + options.num_levels = 8; + + DestroyAndReopen(options); + + auto verify_func = [&](int num_keys, bool if_sleep) { + for (int i = 0; i < num_keys; i++) { + ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i))); + if (i < num_keys / 10) { + ASSERT_EQ("NOT_FOUND", Get(Key(i))); + } else { + ASSERT_NE("NOT_FOUND", Get(Key(i))); + } + if (if_sleep && i % 1000 == 0) { + // Without it, valgrind may choose not to give another + // thread a chance to run before finishing the function, + // causing the test to be extremely slow. + env_->SleepForMicroseconds(1); + } + } + }; + + int total_keys = 1000; + for (int i = 0; i < total_keys; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Delete(Key(i / 10))); + } + verify_func(total_keys, false); + dbfull()->TEST_WaitForCompact(); + + options.level_compaction_dynamic_level_bytes = true; + options.disable_auto_compactions = true; + Reopen(options); + verify_func(total_keys, false); + + std::atomic_bool compaction_finished; + compaction_finished = false; + // Issue manual compaction in one thread and still verify DB state + // in main thread. + ROCKSDB_NAMESPACE::port::Thread t([&]() { + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = options.num_levels - 1; + dbfull()->CompactRange(compact_options, nullptr, nullptr); + compaction_finished.store(true); + }); + do { + verify_func(total_keys, true); + } while (!compaction_finished.load()); + t.join(); + + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "false"}, + })); + + int total_keys2 = 2000; + for (int i = total_keys; i < total_keys2; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Delete(Key(i / 10))); + } + + verify_func(total_keys2, false); + dbfull()->TEST_WaitForCompact(); + verify_func(total_keys2, false); + + // Base level is not level 1 + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); +} +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void) argc; + (void) argv; + return 0; +#endif +} diff --git a/src/rocksdb/db/db_encryption_test.cc b/src/rocksdb/db/db_encryption_test.cc new file mode 100644 index 000000000..b1f3ce23f --- /dev/null +++ b/src/rocksdb/db/db_encryption_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
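+// +// Verifies that values written through the DB's (possibly encrypted) Env do +// not show up as plaintext in the on-disk files when encryption is enabled.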
+// +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include +#include + +namespace ROCKSDB_NAMESPACE { + +class DBEncryptionTest : public DBTestBase { + public: + DBEncryptionTest() : DBTestBase("/db_encryption_test") {} +}; + +#ifndef ROCKSDB_LITE + +TEST_F(DBEncryptionTest, CheckEncrypted) { + ASSERT_OK(Put("foo567", "v1.fetdq")); + ASSERT_OK(Put("bar123", "v2.dfgkjdfghsd")); + Close(); + + // Open all files and look for the values we've put in there. + // They should not be found if encrypted, otherwise + // they should be found. + std::vector fileNames; + auto status = env_->GetChildren(dbname_, &fileNames); + ASSERT_OK(status); + + auto defaultEnv = Env::Default(); + int hits = 0; + for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) { + if ((*it == "..") || (*it == ".")) { + continue; + } + auto filePath = dbname_ + "/" + *it; + std::unique_ptr seqFile; + auto envOptions = EnvOptions(CurrentOptions()); + status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + ASSERT_OK(status); + + uint64_t fileSize; + status = defaultEnv->GetFileSize(filePath, &fileSize); + ASSERT_OK(status); + + std::string scratch; + scratch.reserve(fileSize); + Slice data; + status = seqFile->Read(fileSize, &data, (char*)scratch.data()); + ASSERT_OK(status); + + if (data.ToString().find("foo567") != std::string::npos) { + hits++; + //std::cout << "Hit in " << filePath << "\n"; + } + if (data.ToString().find("v1.fetdq") != std::string::npos) { + hits++; + //std::cout << "Hit in " << filePath << "\n"; + } + if (data.ToString().find("bar123") != std::string::npos) { + hits++; + //std::cout << "Hit in " << filePath << "\n"; + } + if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) { + hits++; + //std::cout << "Hit in " << filePath << "\n"; + } + if (data.ToString().find("dfgk") != std::string::npos) { + hits++; + //std::cout << "Hit in " << filePath << "\n"; + } + } + if (encrypted_env_) { + ASSERT_EQ(hits, 0); + } else { + ASSERT_GE(hits, 4); + } +} + +TEST_F(DBEncryptionTest, ReadEmptyFile) { + auto defaultEnv = Env::Default(); + + // create empty file for reading it back in later + auto envOptions = EnvOptions(CurrentOptions()); + auto filePath = dbname_ + "/empty.empty"; + + Status status; + { + std::unique_ptr writableFile; + status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions); + ASSERT_OK(status); + } + + std::unique_ptr seqFile; + status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + ASSERT_OK(status); + + std::string scratch; + Slice data; + // reading back 16 bytes from the empty file shouldn't trigger an assertion. + // it should just work and return an empty string + status = seqFile->Read(16, &data, (char*)scratch.data()); + ASSERT_OK(status); + + ASSERT_TRUE(data.empty()); +} + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc new file mode 100644 index 000000000..f0f22cb47 --- /dev/null +++ b/src/rocksdb/db/db_filesnapshot.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include "db/db_impl/db_impl.h" +#include "db/job_context.h" +#include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +Status DBImpl::DisableFileDeletions() { + InstrumentedMutexLock l(&mutex_); + ++disable_delete_obsolete_files_; + if (disable_delete_obsolete_files_ == 1) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Disabled, but already disabled. Counter: %d", + disable_delete_obsolete_files_); + } + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + bool file_deletion_enabled = false; + { + InstrumentedMutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + if (disable_delete_obsolete_files_ == 0) { + file_deletion_enabled = true; + FindObsoleteFiles(&job_context, true); + bg_cv_.SignalAll(); + } + } + if (file_deletion_enabled) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Enable, but not really enabled. Counter: %d", + disable_delete_obsolete_files_); + } + job_context.Clean(); + LogFlush(immutable_db_options_.info_log); + return Status::OK(); +} + +int DBImpl::IsFileDeletionsEnabled() const { + return !disable_delete_obsolete_files_; +} + +Status DBImpl::GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool flush_memtable) { + *manifest_file_size = 0; + + mutex_.Lock(); + + if (flush_memtable) { + // flush all dirty data to disk. 
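+ // With atomic_flush, all column families are flushed together in one step; + // otherwise each live CF is flushed in turn, releasing the DB mutex around + // every blocking flush call.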
+ Status status; + if (immutable_db_options_.atomic_flush) { + autovector<ColumnFamilyData*> cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + status = AtomicFlushMemTables(cfds, FlushOptions(), + FlushReason::kGetLiveFiles); + mutex_.Lock(); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + cfd->Ref(); + mutex_.Unlock(); + status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); + mutex_.Lock(); + cfd->UnrefAndTryDelete(); + if (!status.ok()) { + break; + } + } + } + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + + if (!status.ok()) { + mutex_.Unlock(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n", + status.ToString().c_str()); + return status; + } + } + + // Make a set of all of the live *.sst files + std::vector<FileDescriptor> live; + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + cfd->current()->AddLiveFiles(&live); + } + + ret.clear(); + ret.reserve(live.size() + 3); // *.sst + CURRENT + MANIFEST + OPTIONS + + // create names of the live files. The names are not absolute + // paths; instead they are relative to dbname_. + for (const auto& live_file : live) { + ret.push_back(MakeTableFileName("", live_file.GetNumber())); + } + + ret.push_back(CurrentFileName("")); + ret.push_back(DescriptorFileName("", versions_->manifest_file_number())); + ret.push_back(OptionsFileName("", versions_->options_file_number())); + + // find length of manifest file while holding the mutex lock + *manifest_file_size = versions_->manifest_file_size(); + + mutex_.Unlock(); + return Status::OK(); +} + +Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { + { + // If caller disabled deletions, this function should return files that are + // guaranteed not to be deleted until deletions are re-enabled. We need to + // wait for pending purges to finish since WalManager doesn't know which + // files are going to be purged. Additional purges won't be scheduled as + // long as deletions are disabled (so the below loop must terminate). + InstrumentedMutexLock l(&mutex_); + while (disable_delete_obsolete_files_ > 0 && + pending_purge_obsolete_files_ > 0) { + bg_cv_.Wait(); + } + } + return wal_manager_.GetSortedWalFiles(files); +} + +Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) { + uint64_t current_logfile_number; + { + InstrumentedMutexLock l(&mutex_); + current_logfile_number = logfile_number_; + } + + return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file); +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/db_flush_test.cc b/src/rocksdb/db/db_flush_test.cc new file mode 100644 index 000000000..bab206d3d --- /dev/null +++ b/src/rocksdb/db/db_flush_test.cc @@ -0,0 +1,784 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
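+// +// Tests in this file exercise flush scheduling, flush failure handling, +// atomic flush across column families, and races between flush, column +// family drop, and DB close.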
+ +#include <atomic> + +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class DBFlushTest : public DBTestBase { + public: + DBFlushTest() : DBTestBase("/db_flush_test") {} +}; + +class DBFlushDirectIOTest : public DBFlushTest, + public ::testing::WithParamInterface<bool> { + public: + DBFlushDirectIOTest() : DBFlushTest() {} +}; + +class DBAtomicFlushTest : public DBFlushTest, + public ::testing::WithParamInterface<bool> { + public: + DBAtomicFlushTest() : DBFlushTest() {} +}; + +// We had an issue where, when two background threads tried to flush at the +// same time, only one of them got committed. The test verifies the issue is +// fixed. +TEST_F(DBFlushTest, FlushWhileWritingManifest) { + Options options; + options.disable_auto_compactions = true; + options.max_background_flushes = 2; + options.env = env_; + Reopen(options); + FlushOptions no_wait; + no_wait.wait = false; + no_wait.allow_write_stall = true; + + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply:WriteManifest", + "DBFlushTest::FlushWhileWritingManifest:1"}, + {"MemTableList::TryInstallMemtableFlushResults:InProgress", + "VersionSet::LogAndApply:WriteManifestDone"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("foo", "v")); + ASSERT_OK(dbfull()->Flush(no_wait)); + TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1"); + ASSERT_OK(Put("bar", "v")); + ASSERT_OK(dbfull()->Flush(no_wait)); + // If the issue is hit we will wait here forever. + dbfull()->TEST_WaitForFlushMemTable(); +#ifndef ROCKSDB_LITE + ASSERT_EQ(2, TotalTableFiles()); +#endif // ROCKSDB_LITE +} + +// Disable this test temporarily on Travis as it fails intermittently. +// Github issue: #4151 +TEST_F(DBFlushTest, SyncFail) { + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options; + options.disable_auto_compactions = true; + options.env = fault_injection_env.get(); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBFlushTest::SyncFail:GetVersionRefCount:1", + "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"}, + {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", + "DBFlushTest::SyncFail:GetVersionRefCount:2"}, + {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, + {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu"}, options); + Put("key", "value"); + auto* cfd = + reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily()) + ->cfd(); + FlushOptions flush_options; + flush_options.wait = false; + ASSERT_OK(dbfull()->Flush(flush_options)); + // Flush installs a new super-version. Get the ref count after that. + auto current_before = cfd->current(); + int refs_before = cfd->current()->TEST_refs(); + TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1"); + TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2"); + int refs_after_picking_memtables = cfd->current()->TEST_refs(); + ASSERT_EQ(refs_before + 1, refs_after_picking_memtables); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT("DBFlushTest::SyncFail:1"); + TEST_SYNC_POINT("DBFlushTest::SyncFail:2"); + fault_injection_env->SetFilesystemActive(true); + // Now the background job will do the flush; wait for it. 
+ dbfull()->TEST_WaitForFlushMemTable(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("", FilesPerLevel()); // flush failed. +#endif // ROCKSDB_LITE + // Backgroun flush job should release ref count to current version. + ASSERT_EQ(current_before, cfd->current()); + ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); + Destroy(options); +} + +TEST_F(DBFlushTest, SyncSkip) { + Options options = CurrentOptions(); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"}, + {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + Put("key", "value"); + + FlushOptions flush_options; + flush_options.wait = false; + ASSERT_OK(dbfull()->Flush(flush_options)); + + TEST_SYNC_POINT("DBFlushTest::SyncSkip:1"); + TEST_SYNC_POINT("DBFlushTest::SyncSkip:2"); + + // Now the background job will do the flush; wait for it. + dbfull()->TEST_WaitForFlushMemTable(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushInLowPriThreadPool) { + // Verify setting an empty high-pri (flush) thread pool causes flushes to be + // scheduled in the low-pri (compaction) thread pool. + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 4; + options.memtable_factory.reset(new SpecialSkipListFactory(1)); + Reopen(options); + env_->SetBackgroundThreads(0, Env::HIGH); + + std::thread::id tid; + int num_flushes = 0, num_compactions = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkFlush", [&](void* /*arg*/) { + if (tid == std::thread::id()) { + tid = std::this_thread::get_id(); + } else { + ASSERT_EQ(tid, std::this_thread::get_id()); + } + ++num_flushes; + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { + ASSERT_EQ(tid, std::this_thread::get_id()); + ++num_compactions; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key", "val")); + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put("key", "val")); + dbfull()->TEST_WaitForFlushMemTable(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(4, num_flushes); + ASSERT_EQ(1, num_compactions); +} + +TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) { + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + Reopen(options); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkFlush", + "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"}, + {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2", + "FlushJob::WriteLevel0Table"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key1", "value1")); + + port::Thread t([&]() { + // The call wait for flush to finish, i.e. with flush_options.wait = true. + ASSERT_OK(Flush()); + }); + + // Wait for flush start. + TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"); + // Insert a second memtable before the manual flush finish. + // At the end of the manual flush job, it will check if further flush + // is needed, but it will not trigger flush of the second memtable because + // min_write_buffer_number_to_merge is not reached. + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2"); + + // Manual flush should return, without waiting for flush indefinitely. 
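+ // (The helper thread runs Flush() with wait=true, so the join() below + // would hang if the manual flush never returned.)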
+ t.join(); +} + +TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) { + Options options = CurrentOptions(); + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + int called = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto unscheduled_flushes = *reinterpret_cast<int*>(arg); + ASSERT_EQ(0, unscheduled_flushes); + ++called; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "foo")); + FlushOptions flush_opts; + ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_EQ(1, called); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBFlushDirectIOTest, DirectIO) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.max_background_flushes = 2; + options.use_direct_io_for_flush_and_compaction = GetParam(); + options.env = new MockEnv(Env::Default()); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:create_file", [&](void* arg) { + bool* use_direct_writes = static_cast<bool*>(arg); + ASSERT_EQ(*use_direct_writes, + options.use_direct_io_for_flush_and_compaction); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_OK(Put("foo", "v")); + FlushOptions flush_options; + flush_options.wait = true; + ASSERT_OK(dbfull()->Flush(flush_options)); + Destroy(options); + delete options.env; +} + +TEST_F(DBFlushTest, FlushError) { + Options options; + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( + new FaultInjectionTestEnv(env_)); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_injection_env.get(); + Reopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + fault_injection_env->SetFilesystemActive(false); + Status s = dbfull()->TEST_SwitchMemtable(); + fault_injection_env->SetFilesystemActive(true); + Destroy(options); + ASSERT_NE(s, Status::OK()); +} + +TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { + // Regression test for bug where manual flush hangs forever when the DB + // is in read-only mode. Verify it now at least returns, despite failing. + Options options; + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( + new FaultInjectionTestEnv(env_)); + options.env = fault_injection_env.get(); + options.max_write_buffer_number = 2; + Reopen(options); + + // Trigger a first flush but don't let it run + ASSERT_OK(db_->PauseBackgroundWork()); + ASSERT_OK(Put("key1", "value1")); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db_->Flush(flush_opts)); + + // Write a key to the second memtable so we have something to flush later + // after the DB is in read-only mode. + ASSERT_OK(Put("key2", "value2")); + + // Let the first flush continue, hit an error, and put the DB in read-only + // mode. + fault_injection_env->SetFilesystemActive(false); + ASSERT_OK(db_->ContinueBackgroundWork()); + dbfull()->TEST_WaitForFlushMemTable(); +#ifndef ROCKSDB_LITE + uint64_t num_bg_errors; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors, + &num_bg_errors)); + ASSERT_GT(num_bg_errors, 0); +#endif // ROCKSDB_LITE + + // In the bug scenario, triggering another flush would cause the second flush + // to hang forever. After the fix we expect it to return an error. 
+ ASSERT_NOK(db_->Flush(FlushOptions())); + + Close(); +} + +TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) { + Options options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:AfterScheduleFlush", + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_.resize(1); + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { + class TestListener : public EventListener { + public: + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + // There's only one key in each flush. + ASSERT_EQ(info.smallest_seqno, info.largest_seqno); + ASSERT_NE(0, info.smallest_seqno); + if (info.smallest_seqno == seq1) { + // First flush completed + ASSERT_FALSE(completed1); + completed1 = true; + CheckFlushResultCommitted(db, seq1); + } else { + // Second flush completed + ASSERT_FALSE(completed2); + completed2 = true; + ASSERT_EQ(info.smallest_seqno, seq2); + CheckFlushResultCommitted(db, seq2); + } + } + + void CheckFlushResultCommitted(DB* db, SequenceNumber seq) { + DBImpl* db_impl = static_cast_with_check<DBImpl, DB>(db); + InstrumentedMutex* mutex = db_impl->mutex(); + mutex->Lock(); + auto* cfd = + reinterpret_cast<ColumnFamilyHandleImpl*>(db->DefaultColumnFamily()) + ->cfd(); + ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber()); + mutex->Unlock(); + } + + std::atomic<SequenceNumber> seq1{0}; + std::atomic<SequenceNumber> seq2{0}; + std::atomic<bool> completed1{false}; + std::atomic<bool> completed2{false}; + }; + std::shared_ptr<TestListener> listener = std::make_shared<TestListener>(); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:start", + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"}, + {"DBImpl::FlushMemTableToOutputFile:Finish", + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}}); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table", [&listener](void* arg) { + // Wait for the second flush to finish, outside the mutex. + auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg); + if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) { + TEST_SYNC_POINT( + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:" + "WaitSecond"); + } + }); + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.listeners.push_back(listener); + // Setting max_flush_jobs = max_background_jobs / 4 = 2. + options.max_background_jobs = 8; + // Allow 2 immutable memtables. 
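+ // (max_write_buffer_number counts the mutable memtable as well, so 3 + // leaves room for two immutable memtables awaiting flush.)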
+ options.max_write_buffer_number = 3; + Reopen(options); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("foo", "v")); + listener->seq1 = db_->GetLatestSequenceNumber(); + // t1 will wait for the second flush complete before committing flush result. + auto t1 = port::Thread([&]() { + // flush_opts.wait = true + ASSERT_OK(db_->Flush(FlushOptions())); + }); + // Wait for first flush started. + TEST_SYNC_POINT( + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"); + // The second flush will exit early without commit its result. The work + // is delegated to the first flush. + ASSERT_OK(Put("bar", "v")); + listener->seq2 = db_->GetLatestSequenceNumber(); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db_->Flush(flush_opts)); + t1.join(); + ASSERT_TRUE(listener->completed1); + ASSERT_TRUE(listener->completed2); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE + +TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + options.write_buffer_size = (static_cast(64) << 20); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); + } + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + cf_ids.emplace_back(static_cast(i)); + } + ASSERT_OK(Flush(cf_ids)); + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + } +} + +TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + // 4KB so that we can easily trigger auto flush. + options.write_buffer_size = 4096; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:FlushFinish:0", + "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); + } + // Keep writing to one of them column families to trigger auto flush. 
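+ // With atomic_flush enabled, filling this CF's 4KB memtable should flush + // every column family together; without it only this CF should be flushed. + // The assertions below check exactly that difference.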
+ for (int i = 0; i != 4000; ++i) { + ASSERT_OK(Put(static_cast(num_cfs) - 1 /*cf*/, + "key" + std::to_string(i), "value" + std::to_string(i), + wopts)); + } + + TEST_SYNC_POINT( + "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"); + if (options.atomic_flush) { + for (size_t i = 0; i != num_cfs - 1; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + } + } else { + for (size_t i = 0; i != num_cfs - 1; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + } + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1", + "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"}, + {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2", + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_OK(Put(cf_id, "key", "value", wopts)); + } + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2"); + for (auto* cfh : handles_) { + dbfull()->TEST_WaitForFlushMemTable(cfh); + } + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + } + fault_injection_env->SetFilesystemActive(true); + Destroy(options); +} + +TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_OK(Put(cf_id, "key", "value", wopts)); + cf_ids.push_back(cf_id); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped()); + Destroy(options); +} + +TEST_P(DBAtomicFlushTest, + FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = 
atomic_flush; + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::BeforeDropCF"}, + {"DBAtomicFlushTest::AfterDropCF", + "DBImpl::BackgroundCallFlush:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_OK(Put(cf_id, "key", "value", wopts)); + } + port::Thread user_thread([&]() { + TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF"); + }); + FlushOptions flush_opts; + flush_opts.wait = true; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + user_thread.join(); + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_EQ("value", Get(cf_id, "key")); + } + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options); + num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_EQ("value", Get(cf_id, "key")); + } + Destroy(options); +} + +TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + const int kNumKeysTriggerFlush = 4; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysTriggerFlush)); + CreateAndReopenWithCF({"pikachu"}, options); + + for (int i = 0; i != kNumKeysTriggerFlush; ++i) { + ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i))); + } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(0, "key", "value")); + Close(); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ("value", Get(0, "key")); +} + +TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) { + bool atomic_flush = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.max_write_buffer_number = 4; + // Set min_write_buffer_number_to_merge to be greater than 1, so that + // a column family with one memtable in the imm will not cause IsFlushPending + // to return true when flush_requested_ is false. + options.min_write_buffer_number_to_merge = 2; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(dbfull()->PauseBackgroundWork()); + ASSERT_OK(Put(0, "key00", "value00")); + ASSERT_OK(Put(1, "key10", "value10")); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + ASSERT_OK(Put(0, "key01", "value01")); + // Since max_write_buffer_number is 4, the following flush won't cause write + // stall. 
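+ // (Background work is still paused at this point, so both flush requests + // only mark memtables as pending until ContinueBackgroundWork() below.)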
+ ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_[1] = nullptr; + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + delete handles_[0]; + handles_.clear(); +} + +TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(0, "key", "value")); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + auto* cfd_pikachu = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + delete handles_[1]; + handles_.resize(1); + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu}, + flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + auto fault_injection_env = std::make_shared(env_); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + for (size_t cf = 0; cf < handles_.size(); ++cf) { + ASSERT_OK(Put(static_cast(cf), "a", "value")); + } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); }); + SyncPoint::GetInstance()->EnableProcessing(); + FlushOptions flush_opts; + Status s = db_->Flush(flush_opts, handles_); + ASSERT_NOK(s); + fault_injection_env->SetFilesystemActive(true); + Close(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, + testing::Bool()); + +INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc new file mode 100644 index 000000000..d7880fc1a --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl.cc @@ -0,0 
+1,4550 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_impl/db_impl.h" + +#include +#ifdef OS_SOLARIS +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/builder.h" +#include "db/compaction/compaction_job.h" +#include "db/db_info_dumper.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/external_sst_file_ingestion_job.h" +#include "db/flush_job.h" +#include "db/forward_iterator.h" +#include "db/import_column_family_job.h" +#include "db/job_context.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/malloc_stats.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/table_cache.h" +#include "db/table_properties_collector.h" +#include "db/transaction_log_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/write_callback.h" +#include "env/composite_env_wrapper.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "memtable/hash_linklist_rep.h" +#include "memtable/hash_skiplist_rep.h" +#include "monitoring/in_memory_stats_history.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/stats_history.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/get_context.h" +#include "table/merging_iterator.h" +#include "table/multiget_context.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "tools/sst_dump_tool_imp.h" +#include "util/autovector.h" +#include "util/build_version.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); +void DumpRocksDBBuildVersion(Logger* log); + 
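+// Picks the compression type for a flush (L0 output) based on the compaction +// style and the per-level compression settings.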
+CompressionType GetCompressionFlush( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. Otherwise the CPU and + // latency overhead is not offset by saving much space. + if (ioptions.compaction_style == kCompactionStyleUniversal) { + if (mutable_cf_options.compaction_options_universal + .compression_size_percent < 0) { + return mutable_cf_options.compression; + } else { + return kNoCompression; + } + } else if (!ioptions.compression_per_level.empty()) { + // For leveled compress when min_level_to_compress != 0. + return ioptions.compression_per_level[0]; + } else { + return mutable_cf_options.compression; + } +} + +namespace { +void DumpSupportInfo(Logger* logger) { + ROCKS_LOG_HEADER(logger, "Compression algorithms supported:"); + for (auto& compression : OptionsHelper::compression_type_string_map) { + if (compression.second != kNoCompression && + compression.second != kDisableCompressionOption) { + ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(), + CompressionTypeSupported(compression.second)); + } + } + ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s", + crc32c::IsFastCrc32Supported().c_str()); +} +} // namespace + +DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, + const bool seq_per_batch, const bool batch_per_txn) + : dbname_(dbname), + own_info_log_(options.info_log == nullptr), + initial_db_options_(SanitizeOptions(dbname, options)), + env_(initial_db_options_.env), + fs_(initial_db_options_.file_system), + immutable_db_options_(initial_db_options_), + mutable_db_options_(initial_db_options_), + stats_(immutable_db_options_.statistics.get()), + mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, + immutable_db_options_.use_adaptive_mutex), + default_cf_handle_(nullptr), + max_total_in_memory_state_(0), + file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), + file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite( + file_options_, immutable_db_options_)), + seq_per_batch_(seq_per_batch), + batch_per_txn_(batch_per_txn), + db_lock_(nullptr), + shutting_down_(false), + manual_compaction_paused_(false), + bg_cv_(&mutex_), + logfile_number_(0), + log_dir_synced_(false), + log_empty_(true), + persist_stats_cf_handle_(nullptr), + log_sync_cv_(&mutex_), + total_log_size_(0), + is_snapshot_supported_(true), + write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()), + write_thread_(immutable_db_options_), + nonmem_write_thread_(immutable_db_options_), + write_controller_(mutable_db_options_.delayed_write_rate), + last_batch_group_size_(0), + unscheduled_flushes_(0), + unscheduled_compactions_(0), + bg_bottom_compaction_scheduled_(0), + bg_compaction_scheduled_(0), + num_running_compactions_(0), + bg_flush_scheduled_(0), + num_running_flushes_(0), + bg_purge_scheduled_(0), + disable_delete_obsolete_files_(0), + pending_purge_obsolete_files_(0), + delete_obsolete_files_last_run_(env_->NowMicros()), + last_stats_dump_time_microsec_(0), + next_job_id_(1), + has_unpersisted_data_(false), + unable_to_release_oldest_log_(false), + num_running_ingest_file_(0), +#ifndef ROCKSDB_LITE + wal_manager_(immutable_db_options_, file_options_, seq_per_batch), +#endif // ROCKSDB_LITE + event_logger_(immutable_db_options_.info_log.get()), + bg_work_paused_(0), + bg_compaction_paused_(0), + refitting_level_(false), + opened_successfully_(false), + 
two_write_queues_(options.two_write_queues),
+      manual_wal_flush_(options.manual_wal_flush),
+      // last_sequence_ is always maintained by the main queue that also writes
+      // to the memtable. When two_write_queues_ is disabled last seq in
+      // memtable is the same as last seq published to the readers. When it is
+      // enabled but seq_per_batch_ is disabled, last seq in memtable still
+      // indicates last published seq since wal-only writes that go to the 2nd
+      // queue do not consume a sequence number. Otherwise writes performed by
+      // the 2nd queue could change what is visible to the readers. In this
+      // case (last_seq_same_as_publish_seq_ == false), the 2nd queue maintains
+      // a separate variable to indicate the last published sequence.
+      last_seq_same_as_publish_seq_(
+          !(seq_per_batch && options.two_write_queues)),
+      // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+      // requires a custom gc for compaction, we use that to set use_custom_gc_
+      // as well.
+      use_custom_gc_(seq_per_batch),
+      shutdown_initiated_(false),
+      own_sfm_(options.sst_file_manager == nullptr),
+      preserve_deletes_(options.preserve_deletes),
+      closed_(false),
+      error_handler_(this, immutable_db_options_, &mutex_),
+      atomic_flush_install_cv_(&mutex_) {
+  // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+  // WriteUnprepared, which should use seq_per_batch_.
+  assert(batch_per_txn_ || seq_per_batch_);
+  env_->GetAbsolutePath(dbname, &db_absolute_path_);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Give a large number for setting of "infinite" open files.
+  const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+                                   ? TableCache::kInfiniteCapacity
+                                   : mutable_db_options_.max_open_files - 10;
+  LRUCacheOptions co;
+  co.capacity = table_cache_size;
+  co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  table_cache_ = NewLRUCache(co);
+
+  versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+                                 table_cache_.get(), write_buffer_manager_,
+                                 &write_controller_, &block_cache_tracer_));
+  column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+  DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+  DumpDBFileSummary(immutable_db_options_, dbname_);
+  immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  DumpSupportInfo(immutable_db_options_.info_log.get());
+
+  // Always open the DB with 0 here, which means if preserve_deletes_==true
+  // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber()
+  // is called by the client and this seqnum is advanced.
+  preserve_deletes_seqnum_.store(0);
+}
+
+Status DBImpl::Resume() {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+  InstrumentedMutexLock db_mutex(&mutex_);
+
+  if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+    // Nothing to do
+    return Status::OK();
+  }
+
+  if (error_handler_.IsRecoveryInProgress()) {
+    // Don't allow a mix of manual and automatic recovery
+    return Status::Busy();
+  }
+
+  mutex_.Unlock();
+  Status s = error_handler_.RecoverFromBGError(true);
+  mutex_.Lock();
+  return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual as well as automatic recovery. It does
+// the following -
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+//    order to avoid inadvertently causing an error and concluding that the
+//    recovery failed
+// 2. Flush memtables if there's any data for all the CFs. This may result in
+//    another error, which will be saved by error_handler_ and reported later
+//    as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs. This is needed as the
+//    flush in the prior step might have been a no-op for some CFs, which
+//    means a new super version wouldn't have been installed
+Status DBImpl::ResumeImpl() {
+  mutex_.AssertHeld();
+  WaitForBackgroundWork();
+
+  Status bg_error = error_handler_.GetBGError();
+  Status s;
+  if (shutdown_initiated_) {
+    // Returning shutdown status to SFM during auto recovery will cause it
+    // to abort the recovery and allow the shutdown to progress
+    s = Status::ShutdownInProgress();
+  }
+  if (s.ok() && bg_error.severity() > Status::Severity::kHardError) {
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "DB resume requested but failed due to Fatal/Unrecoverable error");
+    s = bg_error;
+  }
+
+  // We cannot guarantee consistency of the WAL. So force flush Memtables of
+  // all the column families
+  if (s.ok()) {
+    FlushOptions flush_opts;
+    // We allow flush to stall write since we are trying to resume from error.
+    flush_opts.allow_write_stall = true;
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery);
+      mutex_.Lock();
+    } else {
+      for (auto cfd : *versions_->GetColumnFamilySet()) {
+        if (cfd->IsDropped()) {
+          continue;
+        }
+        cfd->Ref();
+        mutex_.Unlock();
+        s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery);
+        mutex_.Lock();
+        cfd->UnrefAndTryDelete();
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "DB resume requested but failed due to Flush failure [%s]",
+                     s.ToString().c_str());
+    }
+  }
+
+  JobContext job_context(0);
+  FindObsoleteFiles(&job_context, true);
+  if (s.ok()) {
+    s = error_handler_.ClearBGError();
+  }
+  mutex_.Unlock();
+
+  job_context.manifest_file_number = 1;
+  if (job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+  }
+  mutex_.Lock();
+  // Check for shutdown again before scheduling further compactions,
+  // since we released and re-acquired the lock above
+  if (shutdown_initiated_) {
+    s = Status::ShutdownInProgress();
+  }
+  if (s.ok()) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      SchedulePendingCompaction(cfd);
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  // Wake up any waiters - in this case, it could be the shutdown thread
+  bg_cv_.SignalAll();
+
+  // No need to check BGError again.
If something happened, event listener would + // be notified and the operation causing it would have failed + return s; +} + +void DBImpl::WaitForBackgroundWork() { + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) { + bg_cv_.Wait(); + } +} + +// Will lock the mutex_, will wait for completion if wait is true +void DBImpl::CancelAllBackgroundWork(bool wait) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Shutdown: canceling all background work"); + + if (thread_dump_stats_ != nullptr) { + thread_dump_stats_->cancel(); + thread_dump_stats_.reset(); + } + if (thread_persist_stats_ != nullptr) { + thread_persist_stats_->cancel(); + thread_persist_stats_.reset(); + } + InstrumentedMutexLock l(&mutex_); + if (!shutting_down_.load(std::memory_order_acquire) && + has_unpersisted_data_.load(std::memory_order_relaxed) && + !mutable_db_options_.avoid_flush_during_shutdown) { + if (immutable_db_options_.atomic_flush) { + autovector cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + mutex_.Lock(); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { + cfd->Ref(); + mutex_.Unlock(); + FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); + mutex_.Lock(); + cfd->UnrefAndTryDelete(); + } + } + } + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + } + + shutting_down_.store(true, std::memory_order_release); + bg_cv_.SignalAll(); + if (!wait) { + return; + } + WaitForBackgroundWork(); +} + +Status DBImpl::CloseHelper() { + // Guarantee that there is no background error recovery in progress before + // continuing with the shutdown + mutex_.Lock(); + shutdown_initiated_ = true; + error_handler_.CancelErrorRecovery(); + while (error_handler_.IsRecoveryInProgress()) { + bg_cv_.Wait(); + } + mutex_.Unlock(); + + // CancelAllBackgroundWork called with false means we just set the shutdown + // marker. 
After this we do a variant of the waiting and unscheduling work
+  // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+  CancelAllBackgroundWork(false);
+  int bottom_compactions_unscheduled =
+      env_->UnSchedule(this, Env::Priority::BOTTOM);
+  int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
+  int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
+  Status ret;
+  mutex_.Lock();
+  bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled;
+  bg_compaction_scheduled_ -= compactions_unscheduled;
+  bg_flush_scheduled_ -= flushes_unscheduled;
+
+  // Wait for background work to finish
+  while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ || bg_purge_scheduled_ ||
+         pending_purge_obsolete_files_ ||
+         error_handler_.IsRecoveryInProgress()) {
+    TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+    bg_cv_.Wait();
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+                           &files_grabbed_for_purge_);
+  EraseThreadStatusDbInfo();
+  flush_scheduler_.Clear();
+  trim_history_scheduler_.Clear();
+
+  while (!flush_queue_.empty()) {
+    const FlushRequest& flush_req = PopFirstFromFlushQueue();
+    for (const auto& iter : flush_req) {
+      iter.first->UnrefAndTryDelete();
+    }
+  }
+  while (!compaction_queue_.empty()) {
+    auto cfd = PopFirstFromCompactionQueue();
+    cfd->UnrefAndTryDelete();
+  }
+
+  if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+    // we need to delete the handle outside of the lock because it does its
+    // own locking
+    mutex_.Unlock();
+    if (default_cf_handle_) {
+      delete default_cf_handle_;
+      default_cf_handle_ = nullptr;
+    }
+    if (persist_stats_cf_handle_) {
+      delete persist_stats_cf_handle_;
+      persist_stats_cf_handle_ = nullptr;
+    }
+    mutex_.Lock();
+  }
+
+  // Clean up obsolete files due to SuperVersion release.
+  // (1) Need to delete obsolete files before closing because RepairDB()
+  // scans all existing files in the file system and builds the manifest file.
+  // Keeping obsolete files confuses the repair process.
+  // (2) Need to check if we Open()/Recover() the DB successfully before
+  // deleting because if VersionSet recover fails (may be due to a corrupted
+  // manifest file), it is not able to identify live files correctly. As a
+  // result, all "live" files could get deleted by accident. However, a
+  // corrupted manifest is recoverable by RepairDB().
+  if (opened_successfully_) {
+    JobContext job_context(next_job_id_.fetch_add(1));
+    FindObsoleteFiles(&job_context, true);
+
+    mutex_.Unlock();
+    // manifest number starting from 2
+    job_context.manifest_file_number = 1;
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+    mutex_.Lock();
+  }
+
+  for (auto l : logs_to_free_) {
+    delete l;
+  }
+  for (auto& log : logs_) {
+    uint64_t log_number = log.writer->get_log_number();
+    Status s = log.ClearWriter();
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(
+          immutable_db_options_.info_log,
+          "Unable to Sync WAL file %s with error -- %s",
+          LogFileName(immutable_db_options_.wal_dir, log_number).c_str(),
+          s.ToString().c_str());
+      // Retain the first error
+      if (ret.ok()) {
+        ret = s;
+      }
+    }
+  }
+  logs_.clear();
+
+  // Table cache may have table handles holding blocks from the block cache.
+  // We need to release them before the block cache is destroyed. The block
+  // cache may be destroyed inside versions_.reset(), when the column family
+  // data list is destroyed, so leaving handles in the table cache after
+  // versions_.reset() may cause issues.
+  // Here we clean all unreferenced handles in the table cache.
+  // Now we assume all user queries have finished, so only the version set
+  // itself can possibly hold blocks from the block cache. After releasing the
+  // unreferenced handles here, only handles held by the version set remain;
+  // inside versions_.reset() we release those as well. There, we need to make
+  // sure that every time a handle is released, we erase it from the cache too.
+  // By doing that, we can guarantee that after versions_.reset() the table
+  // cache is empty, so the cache can be safely destroyed.
+  table_cache_->EraseUnRefEntries();
+
+  for (auto& txn_entry : recovered_transactions_) {
+    delete txn_entry.second;
+  }
+
+  // versions need to be destroyed before table_cache since it can hold
+  // references to table_cache.
+  versions_.reset();
+  mutex_.Unlock();
+  if (db_lock_ != nullptr) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+  LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+  // If the sst_file_manager was allocated by us during DB::Open(), call
+  // Close() on it before closing the info_log. Otherwise, the background
+  // thread in SstFileManagerImpl might try to log something.
+  if (immutable_db_options_.sst_file_manager && own_sfm_) {
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    sfm->Close();
+  }
+#endif  // ROCKSDB_LITE
+
+  if (immutable_db_options_.info_log && own_info_log_) {
+    Status s = immutable_db_options_.info_log->Close();
+    if (ret.ok()) {
+      ret = s;
+    }
+  }
+
+  if (ret.IsAborted()) {
+    // Reserve IsAborted() error for cases where users didn't release certain
+    // resources; they can release them and come back and retry. In this case,
+    // we wrap this exception to something else.
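+    // Hedged caller-side sketch (editor's example, not part of this
+    // function): the wrapping above lets a caller tell "resources still
+    // held by the application" apart from other close failures:
+    //
+    //   Status s = db->Close();
+    //   if (s.IsIncomplete()) {
+    //     // some iterators/snapshots/handles were never released by the app
+    //   }
+    //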
+    return Status::Incomplete(ret.ToString());
+  }
+  return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+  if (!closed_) {
+    closed_ = true;
+    CloseHelper();
+  }
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || immutable_db_options_.paranoid_checks) {
+    // No change needed
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+                   s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+  if (immutable_db_options_.wal_ttl_seconds > 0 ||
+      immutable_db_options_.wal_size_limit_mb > 0) {
+    std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir);
+    return env_->CreateDirIfMissing(archivalPath);
+  }
+  return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+  auto dbstats = immutable_db_options_.statistics.get();
+  if (dbstats) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+                   dbstats->ToString().c_str());
+  }
+}
+
+void DBImpl::StartTimedTasks() {
+  unsigned int stats_dump_period_sec = 0;
+  unsigned int stats_persist_period_sec = 0;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec;
+    if (stats_dump_period_sec > 0) {
+      if (!thread_dump_stats_) {
+        thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+            [this]() { DBImpl::DumpStats(); }, "dump_st", env_,
+            static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond));
+      }
+    }
+    stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec;
+    if (stats_persist_period_sec > 0) {
+      if (!thread_persist_stats_) {
+        thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+            [this]() { DBImpl::PersistStats(); }, "pst_st", env_,
+            static_cast<uint64_t>(stats_persist_period_sec) * kMicrosInSecond));
+      }
+    }
+  }
+}
+
+// Estimate the total size of stats_history_
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+  size_t size_total =
+      sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+  if (stats_history_.size() == 0) return size_total;
+  size_t size_per_slice =
+      sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+  // non-empty map, stats_history_.begin() guaranteed to exist
+  std::map<std::string, uint64_t> sample_slice(stats_history_.begin()->second);
+  for (const auto& pairs : sample_slice) {
+    size_per_slice +=
+        pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+  }
+  size_total = size_per_slice * stats_history_.size();
+  return size_total;
+}
+
+void DBImpl::PersistStats() {
+  TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+  if (shutdown_initiated_) {
+    return;
+  }
+  uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond;
+  Statistics* statistics = immutable_db_options_.statistics.get();
+  if (!statistics) {
+    return;
+  }
+  size_t stats_history_size_limit = 0;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+  }
+
+  std::map<std::string, uint64_t> stats_map;
+  if (!statistics->getTickerMap(&stats_map)) {
+    return;
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "------- PERSISTING STATS -------");
+
+  if (immutable_db_options_.persist_stats_to_disk) {
+    WriteBatch batch;
+    if (stats_slice_initialized_) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+                     stats_slice_.size());
+      for (const auto& stat : stats_map) {
+        char key[100];
+        int length =
+            EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+        // calculate the delta from last time
+        if (stats_slice_.find(stat.first) != stats_slice_.end()) {
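+          // Worked example (illustrative numbers): if the previous slice
+          // recorded a ticker at 100 and the current reading is 130, the
+          // value persisted under this timestamp is the delta 30, not the
+          // raw cumulative counter.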
+ uint64_t delta = stat.second - stats_slice_[stat.first]; + batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), + ToString(delta)); + } + } + } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + Status s = Write(wo, &batch); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing to persistent stats CF failed -- %s", + s.ToString().c_str()); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to persistent stats CF succeeded", + stats_slice_.size(), now_seconds); + } + // TODO(Zhongyi): add purging for persisted data + } else { + InstrumentedMutexLock l(&stats_history_mutex_); + // calculate the delta from last time + if (stats_slice_initialized_) { + std::map stats_delta; + for (const auto& stat : stats_map) { + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; + } + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to in-memory stats history", + stats_slice_.size(), now_seconds); + stats_history_[now_seconds] = stats_delta; + } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); + + // delete older stats snapshots to control memory consumption + size_t stats_history_size = EstimateInMemoryStatsHistorySize(); + bool purge_needed = stats_history_size > stats_history_size_limit; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); + while (purge_needed && !stats_history_.empty()) { + stats_history_.erase(stats_history_.begin()); + purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); + } +#endif // !ROCKSDB_LITE +} + +bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time, + uint64_t* new_time, + std::map* stats_map) { + assert(new_time); + assert(stats_map); + if (!new_time || !stats_map) return false; + // lock when search for start_time + { + InstrumentedMutexLock l(&stats_history_mutex_); + auto it = stats_history_.lower_bound(start_time); + if (it != stats_history_.end() && it->first < end_time) { + // make a copy for timestamp and stats_map + *new_time = it->first; + *stats_map = it->second; + return true; + } else { + return false; + } + } +} + +Status DBImpl::GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) { + if (!stats_iterator) { + return Status::InvalidArgument("stats_iterator not preallocated."); + } + if (immutable_db_options_.persist_stats_to_disk) { + stats_iterator->reset( + new PersistentStatsHistoryIterator(start_time, end_time, this)); + } else { + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + } + return (*stats_iterator)->status(); +} + +void DBImpl::DumpStats() { + TEST_SYNC_POINT("DBImpl::DumpStats:1"); +#ifndef ROCKSDB_LITE + const DBPropertyInfo* cf_property_info = + GetPropertyInfo(DB::Properties::kCFStats); + assert(cf_property_info != nullptr); + const 
DBPropertyInfo* db_property_info = + GetPropertyInfo(DB::Properties::kDBStats); + assert(db_property_info != nullptr); + + std::string stats; + if (shutdown_initiated_) { + return; + } + { + InstrumentedMutexLock l(&mutex_); + default_cf_internal_stats_->GetStringProperty( + *db_property_info, DB::Properties::kDBStats, &stats); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats); + } + } + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFFileHistogram, &stats); + } + } + } + TEST_SYNC_POINT("DBImpl::DumpStats:2"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- DUMPING STATS -------"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str()); + if (immutable_db_options_.dump_malloc_stats) { + stats.clear(); + DumpMallocStats(&stats); + if (!stats.empty()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- Malloc STATS -------"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str()); + } + } +#endif // !ROCKSDB_LITE + + PrintStatistics(); +} + +Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str) { + auto* cfh = + static_cast_with_check( + column_family); + ColumnFamilyData* cfd = cfh->cfd(); + + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + Version* version = super_version->current; + + Status s = + version->TablesRangeTombstoneSummary(max_entries_to_print, out_str); + + CleanupSuperVersion(super_version); + return s; +} + +void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { + if (!job_context->logs_to_free.empty()) { + for (auto l : job_context->logs_to_free) { + AddToLogsToFreeQueue(l); + } + job_context->logs_to_free.clear(); + } +} + +Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { + assert(cfd); + Directory* ret_dir = cfd->GetDataDir(path_id); + if (ret_dir == nullptr) { + return directories_.GetDataDir(path_id); + } + return ret_dir; +} + +Status DBImpl::SetOptions( + ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) { +#ifdef ROCKSDB_LITE + (void)column_family; + (void)options_map; + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + auto* cfd = reinterpret_cast(column_family)->cfd(); + if (options_map.empty()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "SetOptions() on column family [%s], empty input", + cfd->GetName().c_str()); + return Status::InvalidArgument("empty input"); + } + + MutableCFOptions new_options; + Status s; + Status persist_options_status; + SuperVersionContext sv_context(/* create_superversion */ true); + { + auto db_options = GetDBOptions(); + InstrumentedMutexLock l(&mutex_); + s = cfd->SetOptions(db_options, options_map); + if (s.ok()) { + new_options = *cfd->GetLatestMutableCFOptions(); + // Append new version to recompute compaction score. + VersionEdit dummy_edit; + versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + // Trigger possible flush/compactions. This has to be before we persist + // options to file, otherwise there will be a deadlock with writer + // thread. 
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options); + + persist_options_status = WriteOptionsFile( + false /*need_mutex_lock*/, true /*need_enter_write_thread*/); + bg_cv_.SignalAll(); + } + } + sv_context.Clean(); + + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str()); + for (const auto& o : options_map) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), + o.second.c_str()); + } + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] SetOptions() succeeded", cfd->GetName().c_str()); + new_options.Dump(immutable_db_options_.info_log.get()); + if (!persist_options_status.ok()) { + s = persist_options_status; + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed", + cfd->GetName().c_str()); + } + LogFlush(immutable_db_options_.info_log); + return s; +#endif // ROCKSDB_LITE +} + +Status DBImpl::SetDBOptions( + const std::unordered_map& options_map) { +#ifdef ROCKSDB_LITE + (void)options_map; + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + if (options_map.empty()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "SetDBOptions(), empty input."); + return Status::InvalidArgument("empty input"); + } + + MutableDBOptions new_options; + Status s; + Status persist_options_status; + bool wal_changed = false; + WriteContext write_context; + { + InstrumentedMutexLock l(&mutex_); + s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, + &new_options); + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } + DBOptions new_db_options = + BuildDBOptions(immutable_db_options_, new_options); + if (s.ok()) { + s = ValidateOptions(new_db_options); + } + if (s.ok()) { + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped()) { + auto cf_options = c->GetLatestCFOptions(); + s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options); + if (!s.ok()) { + break; + } + } + } + } + if (s.ok()) { + const BGJobLimits current_bg_job_limits = + GetBGJobLimits(immutable_db_options_.max_background_flushes, + mutable_db_options_.max_background_compactions, + mutable_db_options_.max_background_jobs, + /* parallelize_compactions */ true); + const BGJobLimits new_bg_job_limits = GetBGJobLimits( + immutable_db_options_.max_background_flushes, + new_options.max_background_compactions, + new_options.max_background_jobs, /* parallelize_compactions */ true); + + const bool max_flushes_increased = + new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes; + const bool max_compactions_increased = + new_bg_job_limits.max_compactions > + current_bg_job_limits.max_compactions; + + if (max_flushes_increased || max_compactions_increased) { + if (max_flushes_increased) { + env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes, + Env::Priority::HIGH); + } + + if (max_compactions_increased) { + env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions, + Env::Priority::LOW); + } + + MaybeScheduleFlushOrCompaction(); + } + + if (new_options.stats_dump_period_sec != + mutable_db_options_.stats_dump_period_sec) { + if (thread_dump_stats_) { + mutex_.Unlock(); + thread_dump_stats_->cancel(); + mutex_.Lock(); + } + if (new_options.stats_dump_period_sec > 0) { + thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( + [this]() { DBImpl::DumpStats(); }, "dump_st", env_, + static_cast(new_options.stats_dump_period_sec) * + 
kMicrosInSecond)); + } else { + thread_dump_stats_.reset(); + } + } + if (new_options.stats_persist_period_sec != + mutable_db_options_.stats_persist_period_sec) { + if (thread_persist_stats_) { + mutex_.Unlock(); + thread_persist_stats_->cancel(); + mutex_.Lock(); + } + if (new_options.stats_persist_period_sec > 0) { + thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( + [this]() { DBImpl::PersistStats(); }, "pst_st", env_, + static_cast(new_options.stats_persist_period_sec) * + kMicrosInSecond)); + } else { + thread_persist_stats_.reset(); + } + } + write_controller_.set_max_delayed_write_rate( + new_options.delayed_write_rate); + table_cache_.get()->SetCapacity(new_options.max_open_files == -1 + ? TableCache::kInfiniteCapacity + : new_options.max_open_files - 10); + wal_changed = mutable_db_options_.wal_bytes_per_sync != + new_options.wal_bytes_per_sync; + mutable_db_options_ = new_options; + file_options_for_compaction_ = FileOptions(new_db_options); + file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite( + file_options_for_compaction_, immutable_db_options_); + versions_->ChangeFileOptions(mutable_db_options_); + //TODO(xiez): clarify why apply optimize for read to write options + file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead( + file_options_for_compaction_, immutable_db_options_); + file_options_for_compaction_.compaction_readahead_size = + mutable_db_options_.compaction_readahead_size; + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) { + Status purge_wal_status = SwitchWAL(&write_context); + if (!purge_wal_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to purge WAL files in SetDBOptions() -- %s", + purge_wal_status.ToString().c_str()); + } + } + persist_options_status = WriteOptionsFile( + false /*need_mutex_lock*/, false /*need_enter_write_thread*/); + write_thread_.ExitUnbatched(&w); + } + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:"); + for (const auto& o : options_map) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), + o.second.c_str()); + } + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded"); + new_options.Dump(immutable_db_options_.info_log.get()); + if (!persist_options_status.ok()) { + if (immutable_db_options_.fail_if_options_file_error) { + s = Status::IOError( + "SetDBOptions() succeeded, but unable to persist options", + persist_options_status.ToString()); + } + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to persist options in SetDBOptions() -- %s", + persist_options_status.ToString().c_str()); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed"); + } + LogFlush(immutable_db_options_.info_log); + return s; +#endif // ROCKSDB_LITE +} + +// return the same level if it cannot be moved +int DBImpl::FindMinimumEmptyLevelFitting( + ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/, + int level) { + mutex_.AssertHeld(); + const auto* vstorage = cfd->current()->storage_info(); + int minimum_level = level; + for (int i = level - 1; i > 0; --i) { + // stop if level i is not empty + if (vstorage->NumLevelFiles(i) > 0) break; + // stop if level i is too small (cannot fit the level files) + if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) { + break; + } + + minimum_level = i; + } + return minimum_level; +} + +Status DBImpl::FlushWAL(bool 
sync) { + if (manual_wal_flush_) { + Status s; + { + // We need to lock log_write_mutex_ since logs_ might change concurrently + InstrumentedMutexLock wl(&log_write_mutex_); + log::Writer* cur_log_writer = logs_.back().writer; + s = cur_log_writer->WriteBuffer(); + } + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", + s.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + WriteStatusCheck(s); + // whether sync or not, we should abort the rest of function upon error + return s; + } + if (!sync) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false"); + return s; + } + } + if (!sync) { + return Status::OK(); + } + // sync = true + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true"); + return SyncWAL(); +} + +Status DBImpl::SyncWAL() { + autovector logs_to_sync; + bool need_log_dir_sync; + uint64_t current_log_number; + + { + InstrumentedMutexLock l(&mutex_); + assert(!logs_.empty()); + + // This SyncWAL() call only cares about logs up to this number. + current_log_number = logfile_number_; + + while (logs_.front().number <= current_log_number && + logs_.front().getting_synced) { + log_sync_cv_.Wait(); + } + // First check that logs are safe to sync in background. + for (auto it = logs_.begin(); + it != logs_.end() && it->number <= current_log_number; ++it) { + if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) { + return Status::NotSupported( + "SyncWAL() is not supported for this implementation of WAL file", + immutable_db_options_.allow_mmap_writes + ? "try setting Options::allow_mmap_writes to false" + : Slice()); + } + } + for (auto it = logs_.begin(); + it != logs_.end() && it->number <= current_log_number; ++it) { + auto& log = *it; + assert(!log.getting_synced); + log.getting_synced = true; + logs_to_sync.push_back(log.writer); + } + + need_log_dir_sync = !log_dir_synced_; + } + + TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1"); + RecordTick(stats_, WAL_FILE_SYNCED); + Status status; + for (log::Writer* log : logs_to_sync) { + status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); + if (!status.ok()) { + break; + } + } + if (status.ok() && need_log_dir_sync) { + status = directories_.GetWalDir()->Fsync(); + } + TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); + + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); + { + InstrumentedMutexLock l(&mutex_); + MarkLogsSynced(current_log_number, need_log_dir_sync, status); + } + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); + + return status; +} + +Status DBImpl::LockWAL() { + log_write_mutex_.Lock(); + auto cur_log_writer = logs_.back().writer; + auto status = cur_log_writer->WriteBuffer(); + if (!status.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", + status.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + WriteStatusCheck(status); + } + return status; +} + +Status DBImpl::UnlockWAL() { + log_write_mutex_.Unlock(); + return Status::OK(); +} + +void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, + const Status& status) { + mutex_.AssertHeld(); + if (synced_dir && logfile_number_ == up_to && status.ok()) { + log_dir_synced_ = true; + } + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { + auto& log = *it; + assert(log.getting_synced); + if (status.ok() && logs_.size() > 1) { + 
logs_to_free_.push_back(log.ReleaseWriter());
+      // To modify logs_ both mutex_ and log_write_mutex_ must be held
+      InstrumentedMutexLock l(&log_write_mutex_);
+      it = logs_.erase(it);
+    } else {
+      log.getting_synced = false;
+      ++it;
+    }
+  }
+  assert(!status.ok() || logs_.empty() || logs_[0].number > up_to ||
+         (logs_.size() == 1 && !logs_[0].getting_synced));
+  log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+  return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+  versions_->SetLastPublishedSequence(seq);
+}
+
+bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
+  if (seqnum > preserve_deletes_seqnum_.load()) {
+    preserve_deletes_seqnum_.store(seqnum);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+InternalIterator* DBImpl::NewInternalIterator(
+    Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
+    ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    cfd = cfh->cfd();
+  }
+
+  mutex_.Lock();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  mutex_.Unlock();
+  ReadOptions roptions;
+  return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg,
+                             sequence);
+}
+
+void DBImpl::SchedulePurge() {
+  mutex_.AssertHeld();
+  assert(opened_successfully_);
+
+  // Purge operations are put into High priority queue
+  bg_purge_scheduled_++;
+  env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
+void DBImpl::BackgroundCallPurge() {
+  mutex_.Lock();
+
+  while (!logs_to_free_queue_.empty()) {
+    assert(!logs_to_free_queue_.empty());
+    log::Writer* log_writer = *(logs_to_free_queue_.begin());
+    logs_to_free_queue_.pop_front();
+    mutex_.Unlock();
+    delete log_writer;
+    mutex_.Lock();
+  }
+  while (!superversions_to_free_queue_.empty()) {
+    assert(!superversions_to_free_queue_.empty());
+    SuperVersion* sv = superversions_to_free_queue_.front();
+    superversions_to_free_queue_.pop_front();
+    mutex_.Unlock();
+    delete sv;
+    mutex_.Lock();
+  }
+
+  // Can't use iterator to go over purge_files_ because inside the loop we're
+  // unlocking the mutex that protects purge_files_.
+  while (!purge_files_.empty()) {
+    auto it = purge_files_.begin();
+    // Need to make a copy of the PurgeFileInfo before unlocking the mutex.
+    PurgeFileInfo purge_file = it->second;
+
+    const std::string& fname = purge_file.fname;
+    const std::string& dir_to_sync = purge_file.dir_to_sync;
+    FileType type = purge_file.type;
+    uint64_t number = purge_file.number;
+    int job_id = purge_file.job_id;
+
+    purge_files_.erase(it);
+
+    mutex_.Unlock();
+    DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+    mutex_.Lock();
+  }
+
+  bg_purge_scheduled_--;
+
+  bg_cv_.SignalAll();
+  // IMPORTANT: there should be no code after calling SignalAll. This call may
+  // signal the DB destructor that it's OK to proceed with destruction. In
+  // that case, all DB variables will be deallocated and referencing them
+  // will cause trouble.
+ mutex_.Unlock(); +} + +namespace { +struct IterState { + IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version, + bool _background_purge) + : db(_db), + mu(_mu), + super_version(_super_version), + background_purge(_background_purge) {} + + DBImpl* db; + InstrumentedMutex* mu; + SuperVersion* super_version; + bool background_purge; +}; + +static void CleanupIteratorState(void* arg1, void* /*arg2*/) { + IterState* state = reinterpret_cast(arg1); + + if (state->super_version->Unref()) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + + state->mu->Lock(); + state->super_version->Cleanup(); + state->db->FindObsoleteFiles(&job_context, false, true); + if (state->background_purge) { + state->db->ScheduleBgLogWriterClose(&job_context); + state->db->AddSuperVersionsToFreeQueue(state->super_version); + state->db->SchedulePurge(); + } + state->mu->Unlock(); + + if (!state->background_purge) { + delete state->super_version; + } + if (job_context.HaveSomethingToDelete()) { + if (state->background_purge) { + // PurgeObsoleteFiles here does not delete files. Instead, it adds the + // files to be deleted to a job queue, and deletes it in a separate + // background thread. + state->db->PurgeObsoleteFiles(job_context, true /* schedule only */); + state->mu->Lock(); + state->db->SchedulePurge(); + state->mu->Unlock(); + } else { + state->db->PurgeObsoleteFiles(job_context); + } + } + job_context.Clean(); + } + + delete state; +} +} // namespace + +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence) { + InternalIterator* internal_iter; + assert(arena != nullptr); + assert(range_del_agg != nullptr); + // Need to create internal iterator from the arena. 
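+  // (Editor's note) "From the arena" means the child iterators below are
+  // placement-new'd into `arena` (see e.g. MemTable::NewIterator), so they
+  // are all reclaimed in one shot when the arena dies; RegisterCleanup then
+  // only has to tear down the IterState, not each child iterator.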
+  MergeIteratorBuilder merge_iter_builder(
+      &cfd->internal_comparator(), arena,
+      !read_options.total_order_seek &&
+          super_version->mutable_cf_options.prefix_extractor != nullptr);
+  // Collect iterator for mutable mem
+  merge_iter_builder.AddIterator(
+      super_version->mem->NewIterator(read_options, arena));
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
+  Status s;
+  if (!read_options.ignore_range_deletions) {
+    range_del_iter.reset(
+        super_version->mem->NewRangeTombstoneIterator(read_options, sequence));
+    range_del_agg->AddTombstones(std::move(range_del_iter));
+  }
+  // Collect all needed child iterators for immutable memtables
+  if (s.ok()) {
+    super_version->imm->AddIterators(read_options, &merge_iter_builder);
+    if (!read_options.ignore_range_deletions) {
+      s = super_version->imm->AddRangeTombstoneIterators(read_options, arena,
+                                                         range_del_agg);
+    }
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+  if (s.ok()) {
+    // Collect iterators for files in L0 - Ln
+    if (read_options.read_tier != kMemtableTier) {
+      super_version->current->AddIterators(read_options, file_options_,
+                                           &merge_iter_builder, range_del_agg);
+    }
+    internal_iter = merge_iter_builder.Finish();
+    IterState* cleanup =
+        new IterState(this, &mutex_, super_version,
+                      read_options.background_purge_on_iterator_cleanup ||
+                          immutable_db_options_.avoid_unnecessary_blocking_io);
+    internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+    return internal_iter;
+  } else {
+    CleanupSuperVersion(super_version);
+  }
+  return NewErrorInternalIterator(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+  return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+  return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableSlice* value) {
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = value;
+  return GetImpl(read_options, key, get_impl_options);
+}
+
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+                       GetImplOptions get_impl_options) {
+  assert(get_impl_options.value != nullptr ||
+         get_impl_options.merge_operands != nullptr);
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+  StopWatch sw(env_, stats_, DB_GET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  auto cfh =
+      reinterpret_cast<ColumnFamilyHandleImpl*>(get_impl_options.column_family);
+  auto cfd = cfh->cfd();
+
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      tracer_->Get(get_impl_options.column_family, key);
+    }
+  }
+
+  // Acquire SuperVersion
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+  TEST_SYNC_POINT("DBImpl::GetImpl:1");
+  TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+  SequenceNumber snapshot;
+  if (read_options.snapshot != nullptr) {
+    if (get_impl_options.callback) {
+      // Already calculated based on read_options.snapshot
+      snapshot = get_impl_options.callback->max_visible_seq();
+    } else {
+      snapshot =
+          reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+    }
+  } else {
+    // Note that the snapshot is assigned AFTER referencing the super
+    // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was
+    // visible to the snapshot before compaction nor the newer data inserted
+    // afterwards.
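+    // Illustrative ordering (editor's note): ref the super version at t0,
+    // let a flush/compaction run at t1, read the last sequence at t2 -- the
+    // super version pinned at t0 can still serve the t2 snapshot. Taking the
+    // snapshot first would leave a window in which compaction drops exactly
+    // the versions that snapshot needs.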
+ snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + if (get_impl_options.callback) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + get_impl_options.callback->Refresh(snapshot); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. + // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = get_impl_options.callback->max_visible_seq(); + } + } + TEST_SYNC_POINT("DBImpl::GetImpl:3"); + TEST_SYNC_POINT("DBImpl::GetImpl:4"); + + // Prepare to store a list of merge operations if merge occurs. + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + + Status s; + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. + LookupKey lkey(key, snapshot, read_options.timestamp); + PERF_TIMER_STOP(get_snapshot_time); + + bool skip_memtable = (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + bool done = false; + if (!skip_memtable) { + // Get value associated with key + if (get_impl_options.get_value) { + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + } else { + // Get Merge Operands associated with key, Merge Operands should not be + // merged and raw values should be returned to the user. + if (sv->mem->Get(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, read_options, nullptr, + nullptr, false)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->GetMergeOperands(lkey, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, sv); + return s; + } + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + sv->current->Get( + read_options, lkey, get_impl_options.value, &s, &merge_context, + &max_covering_tombstone_seq, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? 
get_impl_options.is_blob_index : nullptr,
+        get_impl_options.get_value);
+    RecordTick(stats_, MEMTABLE_MISS);
+  }
+
+  {
+    PERF_TIMER_GUARD(get_post_process_time);
+
+    ReturnAndCleanupSuperVersion(cfd, sv);
+
+    RecordTick(stats_, NUMBER_KEYS_READ);
+    size_t size = 0;
+    if (s.ok()) {
+      if (get_impl_options.get_value) {
+        size = get_impl_options.value->size();
+      } else {
+        // Return all merge operands for get_impl_options.key
+        *get_impl_options.number_of_operands =
+            static_cast<int>(merge_context.GetNumOperands());
+        if (*get_impl_options.number_of_operands >
+            get_impl_options.get_merge_operands_options
+                ->expected_max_number_of_operands) {
+          s = Status::Incomplete(
+              Status::SubCode::KMergeOperandsInsufficientCapacity);
+        } else {
+          for (const Slice& sl : merge_context.GetOperands()) {
+            size += sl.size();
+            get_impl_options.merge_operands->PinSelf(sl);
+            get_impl_options.merge_operands++;
+          }
+        }
+      }
+      RecordTick(stats_, BYTES_READ, size);
+      PERF_COUNTER_ADD(get_read_bytes, size);
+    }
+    RecordInHistogram(stats_, BYTES_PER_READ, size);
+  }
+  return s;
+}
+
+std::vector<Status> DBImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+  StopWatch sw(env_, stats_, DB_MULTIGET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  SequenceNumber consistent_seqnum;
+
+  std::unordered_map<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+      column_family.size());
+  for (auto cf : column_family) {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
+    auto cfd = cfh->cfd();
+    if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+      multiget_cf_data.emplace(cfd->GetID(),
+                               MultiGetColumnFamilyData(cfh, nullptr));
+    }
+  }
+
+  std::function<MultiGetColumnFamilyData*(
+      std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+      iter_deref_lambda =
+          [](std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&
+                 cf_iter) { return &cf_iter->second; };
+
+  bool unref_only =
+      MultiCFSnapshot<std::unordered_map<uint32_t, MultiGetColumnFamilyData>>(
+          read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+          &consistent_seqnum);
+
+  // Contains a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // Note: this always resizes the values array
+  size_t num_keys = keys.size();
+  std::vector<Status> stat_list(num_keys);
+  values->resize(num_keys);
+
+  // Keep track of bytes that we read for statistics-recording later
+  uint64_t bytes_read = 0;
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
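+  // A minimal caller-side sketch of this overload (editor's example, error
+  // handling elided; assumes an open DB* db):
+  //
+  //   std::vector<ColumnFamilyHandle*> cfs(2, db->DefaultColumnFamily());
+  //   std::vector<Slice> mget_keys = {"k1", "k2"};
+  //   std::vector<std::string> vals;
+  //   std::vector<Status> ss =
+  //       db->MultiGet(ReadOptions(), cfs, mget_keys, &vals);
+  //   // ss[i].IsNotFound() distinguishes a missing key from an error.
+  //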
+  size_t num_found = 0;
+  for (size_t i = 0; i < num_keys; ++i) {
+    merge_context.Clear();
+    Status& s = stat_list[i];
+    std::string* value = &(*values)[i];
+
+    LookupKey lkey(keys[i], consistent_seqnum);
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
+    SequenceNumber max_covering_tombstone_seq = 0;
+    auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+    assert(mgd_iter != multiget_cf_data.end());
+    auto mgd = mgd_iter->second;
+    auto super_version = mgd.super_version;
+    bool skip_memtable =
+        (read_options.read_tier == kPersistedTier &&
+         has_unpersisted_data_.load(std::memory_order_relaxed));
+    bool done = false;
+    if (!skip_memtable) {
+      if (super_version->mem->Get(lkey, value, &s, &merge_context,
+                                  &max_covering_tombstone_seq, read_options)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      } else if (super_version->imm->Get(lkey, value, &s, &merge_context,
+                                         &max_covering_tombstone_seq,
+                                         read_options)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      }
+    }
+    if (!done) {
+      PinnableSlice pinnable_val;
+      PERF_TIMER_GUARD(get_from_output_files_time);
+      super_version->current->Get(read_options, lkey, &pinnable_val, &s,
+                                  &merge_context, &max_covering_tombstone_seq);
+      value->assign(pinnable_val.data(), pinnable_val.size());
+      RecordTick(stats_, MEMTABLE_MISS);
+    }
+
+    if (s.ok()) {
+      bytes_read += value->size();
+      num_found++;
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  PERF_TIMER_GUARD(get_post_process_time);
+  autovector<SuperVersion*> superversions_to_delete;
+
+  for (auto mgd_iter : multiget_cf_data) {
+    auto mgd = mgd_iter.second;
+    if (!unref_only) {
+      ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+    } else {
+      mgd.cfd->GetSuperVersion()->Unref();
+    }
+  }
+  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+  PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+  PERF_TIMER_STOP(get_post_process_time);
+
+  return stat_list;
+}
+
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+    const ReadOptions& read_options, ReadCallback* callback,
+    std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+        iter_deref_func,
+    T* cf_list, SequenceNumber* snapshot) {
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  bool last_try = false;
+  if (cf_list->size() == 1) {
+    // Fast path for a single column family. We can simply get the thread
+    // local super version
+    auto cf_iter = cf_list->begin();
+    auto node = iter_deref_func(cf_iter);
+    node->super_version = GetAndRefSuperVersion(node->cfd);
+    if (read_options.snapshot != nullptr) {
+      // Note: In WritePrepared txns this is not necessary but not harmful
+      // either. Because prep_seq > snapshot => commit_seq > snapshot so if
+      // a snapshot is specified we should be fine with skipping seq numbers
+      // that are greater than that.
+      //
+      // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+      // may skip uncommitted data that should be visible to the transaction for
+      // reading own writes.
+      *snapshot =
+          static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+      if (callback) {
+        *snapshot = std::max(*snapshot, callback->max_visible_seq());
+      }
+    } else {
+      // Since we get and reference the super version before getting
+      // the snapshot number, without a mutex protection, it is possible
+      // that a memtable switch happened in the middle and not all the
+      // data for this snapshot is available. But it will contain all
+      // the data available in the super version we have, which is also
+      // a valid snapshot to read from.
+      // We shouldn't get the snapshot before finding and referencing the
+      // super version because a flush happening in between may compact away
+      // data for the snapshot, but the snapshot is earlier than the data
+      // overwriting it, so users may see wrong results.
+      *snapshot = last_seq_same_as_publish_seq_
+                      ? versions_->LastSequence()
+                      : versions_->LastPublishedSequence();
+    }
+  } else {
+    // If we end up with the same issue of a memtable getting sealed during 2
+    // consecutive retries, it means the write rate is very high. In that case
+    // it's probably OK to take the mutex on the 3rd try so we can succeed for
+    // sure.
+    static const int num_retries = 3;
+    for (int i = 0; i < num_retries; ++i) {
+      last_try = (i == num_retries - 1);
+      bool retry = false;
+
+      if (i > 0) {
+        for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+             ++cf_iter) {
+          auto node = iter_deref_func(cf_iter);
+          SuperVersion* super_version = node->super_version;
+          ColumnFamilyData* cfd = node->cfd;
+          if (super_version != nullptr) {
+            ReturnAndCleanupSuperVersion(cfd, super_version);
+          }
+          node->super_version = nullptr;
+        }
+      }
+      if (read_options.snapshot == nullptr) {
+        if (last_try) {
+          TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+          // We're close to max number of retries. For the last retry,
+          // acquire the lock so we're sure to succeed
+          mutex_.Lock();
+        }
+        *snapshot = last_seq_same_as_publish_seq_
+                        ? versions_->LastSequence()
+                        : versions_->LastPublishedSequence();
+      } else {
+        *snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+                        ->number_;
+      }
+      for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+           ++cf_iter) {
+        auto node = iter_deref_func(cf_iter);
+        if (!last_try) {
+          node->super_version = GetAndRefSuperVersion(node->cfd);
+        } else {
+          node->super_version = node->cfd->GetSuperVersion()->Ref();
+        }
+        TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+        if (read_options.snapshot != nullptr || last_try) {
+          // If user passed a snapshot, then we don't care if a memtable is
+          // sealed or compaction happens because the snapshot would ensure
+          // that older key versions are kept around. If this is the last
+          // retry, then we have the lock so nothing bad can happen
+          continue;
+        }
+        // We could get the earliest sequence number for the whole list of
+        // memtables, which will include immutable memtables as well, but that
+        // might be tricky to maintain in case we decide, in future, to do
+        // memtable compaction.
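+        // (Editor's note) Concrete instance of the race this guards against:
+        // *snapshot is read as 100, then a memtable switch lands before this
+        // CF's super version is referenced, so the mutable memtable's
+        // earliest seqno is, say, 105 > 100. The batch can then no longer be
+        // proven consistent across CFs, hence the retry with fresh super
+        // versions triggered just below.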
+        if (!last_try) {
+          SequenceNumber seq =
+              node->super_version->mem->GetEarliestSequenceNumber();
+          if (seq > *snapshot) {
+            retry = true;
+            break;
+          }
+        }
+      }
+      if (!retry) {
+        if (last_try) {
+          mutex_.Unlock();
+        }
+        break;
+      }
+    }
+  }
+
+  // Keep track of bytes that we read for statistics-recording later
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+                      ColumnFamilyHandle** column_families, const Slice* keys,
+                      PinnableSlice* values, Status* statuses,
+                      const bool sorted_input) {
+  if (num_keys == 0) {
+    return;
+  }
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  sorted_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    key_context.emplace_back(column_families[i], keys[i], &values[i],
+                             &statuses[i]);
+  }
+  for (size_t i = 0; i < num_keys; ++i) {
+    sorted_keys[i] = &key_context[i];
+  }
+  PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
+  autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+      multiget_cf_data;
+  size_t cf_start = 0;
+  ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+  for (size_t i = 0; i < num_keys; ++i) {
+    KeyContext* key_ctx = sorted_keys[i];
+    if (key_ctx->column_family != cf) {
+      multiget_cf_data.emplace_back(
+          MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr));
+      cf_start = i;
+      cf = key_ctx->column_family;
+    }
+  }
+  {
+    // multiget_cf_data.emplace_back(
+    //    MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr));
+    multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+  }
+  std::function<MultiGetColumnFamilyData*(
+      autovector<MultiGetColumnFamilyData,
+                 MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+      iter_deref_lambda =
+          [](autovector<MultiGetColumnFamilyData,
+                        MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+            return &(*cf_iter);
+          };
+
+  SequenceNumber consistent_seqnum;
+  bool unref_only = MultiCFSnapshot<
+      autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+      read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+      &consistent_seqnum);
+
+  for (auto cf_iter = multiget_cf_data.begin();
+       cf_iter != multiget_cf_data.end(); ++cf_iter) {
+    MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys,
+                 cf_iter->super_version, consistent_seqnum, nullptr, nullptr);
+    if (!unref_only) {
+      ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version);
+    } else {
+      cf_iter->cfd->GetSuperVersion()->Unref();
+    }
+  }
+}
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+  inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+    ColumnFamilyHandleImpl* cfh =
+        static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+    uint32_t cfd_id1 = cfh->cfd()->GetID();
+    const Comparator* comparator = cfh->cfd()->user_comparator();
+    cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+    uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+    if (cfd_id1 < cfd_id2) {
+      return true;
+    } else if (cfd_id1 > cfd_id2) {
+      return false;
+    }
+
+    // Both keys are from the same column family
+    int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+    if (cmp < 0) {
+      return true;
+    }
+    return false;
+  }
+};
+
+}  // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+    size_t num_keys, bool sorted_input,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+#ifndef NDEBUG
+  if (sorted_input) {
+    for (size_t index = 0; index < sorted_keys->size(); ++index) {
+      if (index > 0) {
+        KeyContext* lhs = (*sorted_keys)[index - 1];
+        KeyContext* rhs = (*sorted_keys)[index];
+        ColumnFamilyHandleImpl* cfh =
+            reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+        uint32_t cfd_id1 = cfh->cfd()->GetID();
+        const Comparator* comparator = cfh->cfd()->user_comparator();
+        cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+        uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+        assert(cfd_id1 <= cfd_id2);
+        if (cfd_id1 < cfd_id2) {
+          continue;
+        }
+
+        // Both keys are from the same column family
+        int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+        assert(cmp <= 0);
+      }
+    }
+  }
+#endif
+  if (!sorted_input) {
+    CompareKeyContext sort_comparator;
+    std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+              sort_comparator);
+  }
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+                      ColumnFamilyHandle* column_family, const size_t num_keys,
+                      const Slice* keys, PinnableSlice* values,
+                      Status* statuses, const bool sorted_input) {
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  sorted_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]);
+  }
+  for (size_t i = 0; i < num_keys; ++i) {
+    sorted_keys[i] = &key_context[i];
+  }
+  PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+  MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+    const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    ReadCallback* callback,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+  std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+  multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+  std::function<MultiGetColumnFamilyData*(
+      std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+      iter_deref_lambda =
+          [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+            return &(*cf_iter);
+          };
+
+  size_t num_keys = sorted_keys->size();
+  SequenceNumber consistent_seqnum;
+  bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+      read_options, callback, iter_deref_lambda, &multiget_cf_data,
+      &consistent_seqnum);
+#ifndef NDEBUG
+  assert(!unref_only);
+#else
+  // Silence unused variable warning
+  (void)unref_only;
+#endif  // NDEBUG
+
+  if (callback && read_options.snapshot == nullptr) {
+    // The unprep_seqs are not published for write unprepared, so it could be
+    // that max_visible_seq is larger. Seek to the std::max of the two.
+    // However, we still want our callback to contain the actual snapshot so
+    // that it can do the correct visibility filtering.
+    callback->Refresh(consistent_seqnum);
+
+    // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+    // max_visible_seq = max(max_visible_seq, snapshot)
+    //
+    // Currently, the commented out assert is broken by
+    // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+    // the regular transaction flow, then this special read callback would not
+    // be needed.
+    //
+    // assert(callback->max_visible_seq() >= snapshot);
+    consistent_seqnum = callback->max_visible_seq();
+  }
+
+  MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+               multiget_cf_data[0].super_version, consistent_seqnum, nullptr,
+               nullptr);
+  ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+                               multiget_cf_data[0].super_version);
+}
+
+void DBImpl::MultiGetImpl(
+    const ReadOptions& read_options, size_t start_key, size_t num_keys,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+    SuperVersion* super_version, SequenceNumber snapshot,
+    ReadCallback* callback, bool* is_blob_index) {
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+  StopWatch sw(env_, stats_, DB_MULTIGET);
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  size_t keys_left = num_keys;
+  while (keys_left) {
+    size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+                            ?
+
+void DBImpl::MultiGetImpl(
+    const ReadOptions& read_options, size_t start_key, size_t num_keys,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+    SuperVersion* super_version, SequenceNumber snapshot,
+    ReadCallback* callback, bool* is_blob_index) {
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+  StopWatch sw(env_, stats_, DB_MULTIGET);
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // look first in the memtable, then in the immutable memtables (if any).
+  // The per-key status is both an input and an output: on input it can be
+  // OK or MergeInProgress, and in the latter case merge_context holds the
+  // sequence of merge operands seen so far.
+  size_t keys_left = num_keys;
+  while (keys_left) {
+    size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+                            ? MultiGetContext::MAX_BATCH_SIZE
+                            : keys_left;
+    MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+                        batch_size, snapshot);
+    MultiGetRange range = ctx.GetMultiGetRange();
+    bool lookup_current = false;
+
+    keys_left -= batch_size;
+    for (auto mget_iter = range.begin(); mget_iter != range.end();
+         ++mget_iter) {
+      mget_iter->merge_context.Clear();
+      *mget_iter->s = Status::OK();
+    }
+
+    bool skip_memtable =
+        (read_options.read_tier == kPersistedTier &&
+         has_unpersisted_data_.load(std::memory_order_relaxed));
+    if (!skip_memtable) {
+      super_version->mem->MultiGet(read_options, &range, callback,
+                                   is_blob_index);
+      if (!range.empty()) {
+        super_version->imm->MultiGet(read_options, &range, callback,
+                                     is_blob_index);
+      }
+      if (!range.empty()) {
+        lookup_current = true;
+        uint64_t left = range.KeysLeft();
+        RecordTick(stats_, MEMTABLE_MISS, left);
+      }
+    }
+    if (lookup_current) {
+      PERF_TIMER_GUARD(get_from_output_files_time);
+      super_version->current->MultiGet(read_options, &range, callback,
+                                       is_blob_index);
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  PERF_TIMER_GUARD(get_post_process_time);
+  size_t num_found = 0;
+  uint64_t bytes_read = 0;
+  for (size_t i = start_key; i < start_key + num_keys; ++i) {
+    KeyContext* key = (*sorted_keys)[i];
+    if (key->s->ok()) {
+      bytes_read += key->value->size();
+      num_found++;
+    }
+  }
+
+  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+  PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+  PERF_TIMER_STOP(get_post_process_time);
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                  const std::string& column_family,
+                                  ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+  if (s.ok()) {
+    s = WriteOptionsFile(true /*need_mutex_lock*/,
+                         true /*need_enter_write_thread*/);
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+    const ColumnFamilyOptions& cf_options,
+    const std::vector<std::string>& column_family_names,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  assert(handles != nullptr);
+  handles->clear();
+  size_t num_cf = column_family_names.size();
+  Status s;
+  bool success_once = false;
+  for (size_t i = 0; i < num_cf; i++) {
+    ColumnFamilyHandle* handle;
+    s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+    if (!s.ok()) {
+      break;
+    }
+    handles->push_back(handle);
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
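+
+// Column family creation in the overloads above and below stops at the
+// first failure: on error, *handles holds only the column families created
+// before the failure, and the options file is still rewritten if at least
+// one creation succeeded.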
+
+Status DBImpl::CreateColumnFamilies(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  assert(handles != nullptr);
+  handles->clear();
+  size_t num_cf = column_families.size();
+  Status s;
+  bool success_once = false;
+  for (size_t i = 0; i < num_cf; i++) {
+    ColumnFamilyHandle* handle;
+    s = CreateColumnFamilyImpl(column_families[i].options,
+                               column_families[i].name, &handle);
+    if (!s.ok()) {
+      break;
+    }
+    handles->push_back(handle);
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+                                      const std::string& column_family_name,
+                                      ColumnFamilyHandle** handle) {
+  Status s;
+  Status persist_options_status;
+  *handle = nullptr;
+
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+  if (s.ok()) {
+    for (auto& cf_path : cf_options.cf_paths) {
+      s = env_->CreateDirIfMissing(cf_path.path);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+        nullptr) {
+      return Status::InvalidArgument("Column family already exists");
+    }
+    VersionEdit edit;
+    edit.AddColumnFamily(column_family_name);
+    uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+    edit.SetColumnFamily(new_id);
+    edit.SetLogNumber(logfile_number_);
+    edit.SetComparatorName(cf_options.comparator->Name());
+
+    {  // write thread
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      // LogAndApply will both write the creation in MANIFEST and create
+      // the ColumnFamilyData object
+      s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+                                 &mutex_, directories_.GetDbDir(), false,
+                                 &cf_options);
+      write_thread_.ExitUnbatched(&w);
+    }
+    if (s.ok()) {
+      auto* cfd =
+          versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+      assert(cfd != nullptr);
+      std::map<std::string, std::shared_ptr<FSDirectory>> dummy_created_dirs;
+      s = cfd->AddDirectories(&dummy_created_dirs);
+    }
+    if (s.ok()) {
+      single_column_family_mode_ = false;
+      auto* cfd =
+          versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+      assert(cfd != nullptr);
+      InstallSuperVersionAndScheduleWork(cfd, &sv_context,
+                                         *cfd->GetLatestMutableCFOptions());
+
+      if (!cfd->mem()->IsSnapshotSupported()) {
+        is_snapshot_supported_ = false;
+      }
+
+      cfd->set_initialized();
+
+      *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Created column family [%s] (ID %u)",
+                     column_family_name.c_str(), (unsigned)cfd->GetID());
+    } else {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Creating column family [%s] FAILED -- %s",
+                      column_family_name.c_str(), s.ToString().c_str());
+    }
+  }  // InstrumentedMutexLock l(&mutex_)
+
+  sv_context.Clean();
+  // this is outside the mutex
+  if (s.ok()) {
+    NewThreadStatusCfInfo(
+        reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
+  }
+  return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+  assert(column_family != nullptr);
+  Status s = DropColumnFamilyImpl(column_family);
+  if (s.ok()) {
+    s = WriteOptionsFile(true /*need_mutex_lock*/,
+                         true /*need_enter_write_thread*/);
+  }
+  return s;
+}
+
+Status DBImpl::DropColumnFamilies(
+    const std::vector<ColumnFamilyHandle*>& column_families) {
+  Status s;
+  bool success_once = false;
+  for (auto* handle : column_families) {
+    s = DropColumnFamilyImpl(handle);
+    if (!s.ok()) {
+      break;
+    }
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
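+
+// Dropping a column family writes a drop record to the MANIFEST and marks
+// the ColumnFamilyData as dropped; its SST files are only deleted later,
+// once all outstanding references (handles, iterators, running compactions)
+// have been released.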
+
+Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  if (cfd->GetID() == 0) {
+    return Status::InvalidArgument("Can't drop default column family");
+  }
+
+  bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
+  VersionEdit edit;
+  edit.DropColumnFamily();
+  edit.SetColumnFamily(cfd->GetID());
+
+  Status s;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    if (cfd->IsDropped()) {
+      s = Status::InvalidArgument("Column family already dropped!\n");
+    }
+    if (s.ok()) {
+      // we drop column family from a single write thread
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+                                 &mutex_);
+      write_thread_.ExitUnbatched(&w);
+    }
+    if (s.ok()) {
+      auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+      max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+                                    mutable_cf_options->max_write_buffer_number;
+    }
+
+    if (!cf_support_snapshot) {
+      // The dropped column family didn't support snapshots, so
+      // is_snapshot_supported_ has to be recalculated over the remaining CFs.
+      bool new_is_snapshot_supported = true;
+      for (auto c : *versions_->GetColumnFamilySet()) {
+        if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+          new_is_snapshot_supported = false;
+          break;
+        }
+      }
+      is_snapshot_supported_ = new_is_snapshot_supported;
+    }
+    bg_cv_.SignalAll();
+  }
+
+  if (s.ok()) {
+    // Note that here we erase the associated cf_info of the to-be-dropped
+    // cfd before its ref-count goes to zero to avoid having to erase cf_info
+    // later inside db_mutex.
+    EraseThreadStatusCfInfo(cfd);
+    assert(cfd->IsDropped());
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Dropped column family with id %u\n", cfd->GetID());
+  } else {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Dropping column family with id %u FAILED -- %s\n",
+                    cfd->GetID(), s.ToString().c_str());
+  }
+
+  return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         std::string* value, bool* value_found) {
+  assert(value != nullptr);
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  ReadOptions roptions = read_options;
+  roptions.read_tier = kBlockCacheTier;  // read from block cache only
+  PinnableSlice pinnable_val;
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = &pinnable_val;
+  get_impl_options.value_found = value_found;
+  auto s = GetImpl(roptions, key, get_impl_options);
+  value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
+  return s.ok() || s.IsIncomplete();
+}
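+
+// KeyMayExist is a cheap negative filter: a "true" result is only a hint.
+// Illustrative use (comment only):
+//
+//   std::string value;
+//   bool value_found = false;
+//   if (db->KeyMayExist(ReadOptions(), cf, key, &value, &value_found)) {
+//     if (!value_found) {
+//       // Possibly present; a regular Get() is needed to confirm.
+//     }
+//   } else {
+//     // Definitely not present.
+//   }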
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family) {
+  if (read_options.managed) {
+    return NewErrorIterator(
+        Status::NotSupported("Managed iterator is not supported anymore."));
+  }
+  Iterator* result = nullptr;
+  if (read_options.read_tier == kPersistedTier) {
+    return NewErrorIterator(Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators."));
+  }
+  // if the iterator wants internal keys, we can only proceed if
+  // we can guarantee that the deletes haven't been processed yet
+  if (immutable_db_options_.preserve_deletes &&
+      read_options.iter_start_seqnum > 0 &&
+      read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) {
+    return NewErrorIterator(Status::InvalidArgument(
+        "Iterator requested internal keys which are too old and are not"
+        " guaranteed to be preserved, try larger iter_start_seqnum opt."));
+  }
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+    // not supported in lite version
+    result = nullptr;
+#else
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+    auto iter = new ForwardIterator(this, read_options, cfd, sv);
+    result = NewDBIterator(
+        env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+        cfd->user_comparator(), iter, kMaxSequenceNumber,
+        sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
+        this, cfd);
+#endif
+  } else {
+    // Note: no need to consider the special case of
+    // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+    // WritePreparedTxnDB
+    auto snapshot = read_options.snapshot != nullptr
+                        ? read_options.snapshot->GetSequenceNumber()
+                        : versions_->LastSequence();
+    result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+  }
+  return result;
+}
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+                                            ColumnFamilyData* cfd,
+                                            SequenceNumber snapshot,
+                                            ReadCallback* read_callback,
+                                            bool allow_blob,
+                                            bool allow_refresh) {
+  SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+  // Try to generate a DB iterator tree in a continuous memory area to be
+  // cache friendly. Here is an example of the resulting layout:
+  //
+  //    +-------------------------------+
+  //    |                               |
+  //    | ArenaWrappedDBIter            |
+  //    |  +                            |
+  //    |  +---> Inner Iterator   ------------+
+  //    |  |                            |     |
+  //    |  |    +-- -- -- -- -- -- -- --+     |
+  //    |  +--- | Arena                 |     |
+  //    |       |                       |     |
+  //    |          Allocated Memory:    |     |
+  //    |       |   +-------------------+     |
+  //    |       |   | DBIter            | <---+
+  //    |           |  +                |
+  //    |       |   |  +-> iter_  ------------+
+  //    |       |   |                   |     |
+  //    |       |   +-------------------+     |
+  //    |       |   | MergingIterator   | <---+
+  //    |           |  +                |
+  //    |       |   |  +->child iter1  ------------+
+  //    |       |   |  |                |          |
+  //    |           |  +->child iter2  ----------+ |
+  //    |       |   |  |                |        | |
+  //    |       |   |  +->child iter3  --------+ | |
+  //    |           |                   |      | | |
+  //    |       |   +-------------------+      | | |
+  //    |       |   | Iterator1         | <--------+
+  //    |       |   +-------------------+      | |
+  //    |       |   | Iterator2         | <------+
+  //    |       |   +-------------------+      |
+  //    |       |   | Iterator3         | <----+
+  //    |       |   +-------------------+
+  //    |       |                       |
+  //    +-------+-----------------------+
+  //
+  // ArenaWrappedDBIter inlines an arena area where all the iterators in
+  // the iterator tree are allocated in the order of being accessed when
+  // querying.
+  // Laying out the iterators in the order of being accessed makes it more
+  // likely that any iterator pointer is close to the iterator it points to so
+  // that they are likely to be in the same cache line and/or page.
+  ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+      env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot,
+      sv->mutable_cf_options.max_sequential_skip_in_iterations,
+      sv->version_number, read_callback, this, cfd, allow_blob,
+      read_options.snapshot != nullptr ? false : allow_refresh);
+
+  InternalIterator* internal_iter =
+      NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
+                          db_iter->GetRangeDelAggregator(), snapshot);
+  db_iter->SetIterUnderDBIter(internal_iter);
+
+  return db_iter;
+}
+
+Status DBImpl::NewIterators(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  if (read_options.managed) {
+    return Status::NotSupported("Managed iterator is not supported anymore.");
+  }
+  if (read_options.read_tier == kPersistedTier) {
+    return Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators.");
+  }
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+    return Status::InvalidArgument(
+        "Tailing iterator not supported in RocksDB lite");
+#else
+    for (auto cfh : column_families) {
+      auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+      SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+      auto iter = new ForwardIterator(this, read_options, cfd, sv);
+      iterators->push_back(NewDBIterator(
+          env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+          cfd->user_comparator(), iter, kMaxSequenceNumber,
+          sv->mutable_cf_options.max_sequential_skip_in_iterations,
+          read_callback, this, cfd));
+    }
+#endif
+  } else {
+    // Note: no need to consider the special case of
+    // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
+    // WritePreparedTxnDB
+    auto snapshot = read_options.snapshot != nullptr
+                        ? read_options.snapshot->GetSequenceNumber()
+                        : versions_->LastSequence();
+    for (size_t i = 0; i < column_families.size(); ++i) {
+      auto* cfd =
+          reinterpret_cast<ColumnFamilyHandleImpl*>(column_families[i])->cfd();
+      iterators->push_back(
+          NewIteratorImpl(read_options, cfd, snapshot, read_callback));
+    }
+  }
+
+  return Status::OK();
+}
+
+const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
+
+#ifndef ROCKSDB_LITE
+const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
+  return GetSnapshotImpl(true);
+}
+#endif  // ROCKSDB_LITE
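+
+// GetSnapshotImpl pins the DB's visible sequence number. When
+// last_seq_same_as_publish_seq_ is false (e.g. write-prepared/unprepared
+// transactions publish sequence numbers separately from allocation), the
+// snapshot must use the last *published* sequence so that unpublished
+// writes remain invisible to readers of the snapshot.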
+
+SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
+                                      bool lock) {
+  int64_t unix_time = 0;
+  env_->GetCurrentTime(&unix_time);  // Ignore error
+  SnapshotImpl* s = new SnapshotImpl;
+
+  if (lock) {
+    mutex_.Lock();
+  }
+  // Return nullptr if the underlying memtable does not support snapshots.
+  if (!is_snapshot_supported_) {
+    if (lock) {
+      mutex_.Unlock();
+    }
+    delete s;
+    return nullptr;
+  }
+  auto snapshot_seq = last_seq_same_as_publish_seq_
+                          ? versions_->LastSequence()
+                          : versions_->LastPublishedSequence();
+  SnapshotImpl* snapshot =
+      snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
+  if (lock) {
+    mutex_.Unlock();
+  }
+  return snapshot;
+}
+
+namespace {
+typedef autovector<ColumnFamilyData*> CfdList;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+  for (const ColumnFamilyData* t : list) {
+    if (t == cfd) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    snapshots_.Delete(casted_s);
+    uint64_t oldest_snapshot;
+    if (snapshots_.empty()) {
+      oldest_snapshot = last_seq_same_as_publish_seq_
+                            ? versions_->LastSequence()
+                            : versions_->LastPublishedSequence();
+    } else {
+      oldest_snapshot = snapshots_.oldest()->number_;
+    }
+    // Avoid going through every column family by checking a global
+    // threshold first.
+    if (oldest_snapshot > bottommost_files_mark_threshold_) {
+      CfdList cf_scheduled;
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+        if (!cfd->current()
+                 ->storage_info()
+                 ->BottommostFilesMarkedForCompaction()
+                 .empty()) {
+          SchedulePendingCompaction(cfd);
+          MaybeScheduleFlushOrCompaction();
+          cf_scheduled.push_back(cfd);
+        }
+      }
+
+      // Calculate a new threshold, skipping those CFs where compactions are
+      // scheduled. We do not do the same pass as the previous loop because
+      // mutex might be unlocked during the loop, making the result inaccurate.
+      SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        if (CfdListContains(cf_scheduled, cfd)) {
+          continue;
+        }
+        new_bottommost_files_mark_threshold = std::min(
+            new_bottommost_files_mark_threshold,
+            cfd->current()->storage_info()->bottommost_files_mark_threshold());
+      }
+      bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
+    }
+  }
+  delete casted_s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                        TablePropertiesCollection* props) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Increment the ref count
+  mutex_.Lock();
+  auto version = cfd->current();
+  version->Ref();
+  mutex_.Unlock();
+
+  auto s = version->GetPropertiesOfAllTables(props);
+
+  // Decrement the ref count
+  mutex_.Lock();
+  version->Unref();
+  mutex_.Unlock();
+
+  return s;
+}
+
+Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
+                                            const Range* range, std::size_t n,
+                                            TablePropertiesCollection* props) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Increment the ref count
+  mutex_.Lock();
+  auto version = cfd->current();
+  version->Ref();
+  mutex_.Unlock();
+
+  auto s = version->GetPropertiesOfTablesInRange(range, n, props);
+
+  // Decrement the ref count
+  mutex_.Lock();
+  version->Unref();
+  mutex_.Unlock();
+
+  return s;
+}
+
+#endif  // ROCKSDB_LITE
+
+const std::string& DBImpl::GetName() const { return dbname_; }
+
+Env* DBImpl::GetEnv() const { return env_; }
+
+FileSystem* DB::GetFileSystem() const {
+  static LegacyFileSystemWrapper fs_wrap(GetEnv());
+  return &fs_wrap;
+}
+
+FileSystem* DBImpl::GetFileSystem() const {
+  return immutable_db_options_.fs.get();
+}
+
+Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+  InstrumentedMutexLock l(&mutex_);
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+                 cfh->cfd()->GetLatestCFOptions());
+}
+
+DBOptions DBImpl::GetDBOptions() const {
+  InstrumentedMutexLock l(&mutex_);
+  return BuildDBOptions(immutable_db_options_, mutable_db_options_);
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+                         const Slice& property, std::string* value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  value->clear();
+  auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  if (property_info == nullptr) {
+    return false;
+  } else if (property_info->handle_int) {
+    uint64_t int_value;
+    bool ret_value =
+        GetIntPropertyInternal(cfd, *property_info, false, &int_value);
+    if (ret_value) {
+      *value = ToString(int_value);
+    }
+    return ret_value;
+  } else if (property_info->handle_string) {
+    InstrumentedMutexLock l(&mutex_);
+    return cfd->internal_stats()->GetStringProperty(*property_info, property,
+                                                    value);
+  } else if (property_info->handle_string_dbimpl) {
+    std::string tmp_value;
+    bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value);
+    if (ret_value) {
+      *value = tmp_value;
+    }
+    return ret_value;
+  }
+  // Shouldn't reach here since exactly one of the property handlers should
+  // be non-nullptr.
+  assert(false);
+  return false;
+}
+
+bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
+                            const Slice& property,
+                            std::map<std::string, std::string>* value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  value->clear();
+  auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  if (property_info == nullptr) {
+    return false;
+  } else if (property_info->handle_map) {
+    InstrumentedMutexLock l(&mutex_);
+    return cfd->internal_stats()->GetMapProperty(*property_info, property,
+                                                 value);
+  }
+  // If we reach this point it means that handle_map is not provided for the
+  // requested property
+  return false;
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+                            const Slice& property, uint64_t* value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  if (property_info == nullptr || property_info->handle_int == nullptr) {
+    return false;
+  }
+  auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  return GetIntPropertyInternal(cfd, *property_info, false, value);
+}
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
+                                    const DBPropertyInfo& property_info,
+                                    bool is_locked, uint64_t* value) {
+  assert(property_info.handle_int != nullptr);
+  if (!property_info.need_out_of_mutex) {
+    if (is_locked) {
+      mutex_.AssertHeld();
+      return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+    }
+  } else {
+    SuperVersion* sv = nullptr;
+    if (!is_locked) {
+      sv = GetAndRefSuperVersion(cfd);
+    } else {
+      sv = cfd->GetSuperVersion();
+    }
+
+    bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+        property_info, sv->current, value);
+
+    if (!is_locked) {
+      ReturnAndCleanupSuperVersion(cfd, sv);
+    }
+
+    return ret;
+  }
+}
+
+bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
+  assert(value != nullptr);
+  Statistics* statistics = immutable_db_options_.statistics.get();
+  if (!statistics) {
+    return false;
+  }
+  *value = statistics->ToString();
+  return true;
+}
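+
+// Illustrative property queries (comment only); "rocksdb.stats" and
+// "rocksdb.estimate-num-keys" are standard property names:
+//
+//   std::string stats;
+//   db->GetProperty(cf, "rocksdb.stats", &stats);
+//   uint64_t num_keys = 0;
+//   db->GetIntProperty(cf, "rocksdb.estimate-num-keys", &num_keys);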
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::ResetStats() {
+  InstrumentedMutexLock l(&mutex_);
+  for (auto* cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->initialized()) {
+      cfd->internal_stats()->Clear();
+    }
+  }
+  return Status::OK();
+}
+#endif  // ROCKSDB_LITE
+
+bool DBImpl::GetAggregatedIntProperty(const Slice& property,
+                                      uint64_t* aggregated_value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  if (property_info == nullptr || property_info->handle_int == nullptr) {
+    return false;
+  }
+
+  uint64_t sum = 0;
+  {
+    // Needs mutex to protect the list of column families.
+    InstrumentedMutexLock l(&mutex_);
+    uint64_t value;
+    for (auto* cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->initialized()) {
+        continue;
+      }
+      if (GetIntPropertyInternal(cfd, *property_info, true, &value)) {
+        sum += value;
+      } else {
+        return false;
+      }
+    }
+  }
+  *aggregated_value = sum;
+  return true;
+}
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+  // TODO(ljin): consider using GetReferencedSuperVersion() directly
+  return cfd->GetThreadLocalSuperVersion(this);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+  auto column_family_set = versions_->GetColumnFamilySet();
+  auto cfd = column_family_set->GetColumnFamily(column_family_id);
+  if (!cfd) {
+    return nullptr;
+  }
+
+  return GetAndRefSuperVersion(cfd);
+}
+
+void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
+  // Release SuperVersion
+  if (sv->Unref()) {
+    bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+    {
+      InstrumentedMutexLock l(&mutex_);
+      sv->Cleanup();
+      if (defer_purge) {
+        AddSuperVersionsToFreeQueue(sv);
+        SchedulePurge();
+      }
+    }
+    if (!defer_purge) {
+      delete sv;
+    }
+    RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+  }
+  RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+                                          SuperVersion* sv) {
+  if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+    CleanupSuperVersion(sv);
+  }
+}
+
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+                                          SuperVersion* sv) {
+  auto column_family_set = versions_->GetColumnFamilySet();
+  auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+  // If SuperVersion is held, and we successfully fetched a cfd using
+  // GetAndRefSuperVersion(), it must still exist.
+  assert(cfd != nullptr);
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+  ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+  if (!cf_memtables->Seek(column_family_id)) {
+    return nullptr;
+  }
+
+  return cf_memtables->GetColumnFamilyHandle();
+}
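+
+// SuperVersion lifecycle note: every GetAndRefSuperVersion() must be paired
+// with ReturnAndCleanupSuperVersion() on the same column family, e.g.
+//
+//   SuperVersion* sv = GetAndRefSuperVersion(cfd);
+//   // ... read from sv->mem / sv->imm / sv->current ...
+//   ReturnAndCleanupSuperVersion(cfd, sv);
+//
+// The last reference triggers CleanupSuperVersion(), which may defer the
+// actual deletion to the purge thread when avoid_unnecessary_blocking_io
+// is set.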
+
+// REQUIRED: mutex is NOT held.
+std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
+    uint32_t column_family_id) {
+  InstrumentedMutexLock l(&mutex_);
+
+  auto* cfd =
+      versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
+  if (cfd == nullptr) {
+    return nullptr;
+  }
+
+  return std::unique_ptr<ColumnFamilyHandle>(
+      new ColumnFamilyHandleImpl(cfd, this, &mutex_));
+}
+
+void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                         const Range& range,
+                                         uint64_t* const count,
+                                         uint64_t* const size) {
+  ColumnFamilyHandleImpl* cfh =
+      reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+  // Convert user_key into a corresponding internal key.
+  InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
+  InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
+  MemTable::MemTableStats memStats =
+      sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
+  MemTable::MemTableStats immStats =
+      sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
+  *count = memStats.count + immStats.count;
+  *size = memStats.size + immStats.size;
+
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+                                   ColumnFamilyHandle* column_family,
+                                   const Range* range, int n,
+                                   uint64_t* sizes) {
+  if (!options.include_memtabtles && !options.include_files) {
+    return Status::InvalidArgument("Invalid options");
+  }
+
+  Version* v;
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  v = sv->current;
+
+  for (int i = 0; i < n; i++) {
+    // Convert user_key into a corresponding internal key.
+    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    sizes[i] = 0;
+    if (options.include_files) {
+      sizes[i] += versions_->ApproximateSize(
+          options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+          /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
+    }
+    if (options.include_memtabtles) {
+      sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
+      sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
+    }
+  }
+
+  ReturnAndCleanupSuperVersion(cfd, sv);
+  return Status::OK();
+}
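+
+// Illustrative size query (comment only). Note that "include_memtabtles"
+// is the field's actual (misspelled) name in SizeApproximationOptions:
+//
+//   SizeApproximationOptions opts;
+//   opts.include_memtabtles = true;
+//   opts.include_files = true;
+//   Range r(start_key, limit_key);
+//   uint64_t size = 0;
+//   db->GetApproximateSizes(opts, cf, &r, 1, &size);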
+
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+  // We need to remember the iterator of our insert, because after the
+  // background job is done, we need to remove that element from
+  // pending_outputs_.
+  pending_outputs_.push_back(versions_->current_next_file_number());
+  auto pending_outputs_inserted_elem = pending_outputs_.end();
+  --pending_outputs_inserted_elem;
+  return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+    std::unique_ptr<std::list<uint64_t>::iterator>& v) {
+  if (v.get() != nullptr) {
+    pending_outputs_.erase(*v.get());
+    v.reset();
+  }
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetUpdatesSince(
+    SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options) {
+  RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
+  if (seq > versions_->LastSequence()) {
+    return Status::NotFound("Requested sequence not yet written in the db");
+  }
+  return wal_manager_.GetUpdatesSince(seq, iter, read_options,
+                                      versions_.get());
+}
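+
+// Illustrative WAL tailing via GetUpdatesSince (comment only;
+// "last_applied_seq" is a placeholder for the caller's own bookkeeping):
+//
+//   std::unique_ptr<TransactionLogIterator> it;
+//   if (db->GetUpdatesSince(last_applied_seq + 1, &it).ok()) {
+//     for (; it->Valid(); it->Next()) {
+//       BatchResult batch = it->GetBatch();
+//       // batch.sequence, batch.writeBatchPtr ...
+//     }
+//   }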
+
+Status DBImpl::DeleteFile(std::string name) {
+  uint64_t number;
+  FileType type;
+  WalFileType log_type;
+  if (!ParseFileName(name, &number, &type, &log_type) ||
+      (type != kTableFile && type != kLogFile)) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
+                    name.c_str());
+    return Status::InvalidArgument("Invalid file name");
+  }
+
+  Status status;
+  if (type == kLogFile) {
+    // Only allow deleting archived log files
+    if (log_type != kArchivedLogFile) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "DeleteFile %s failed - not archived log.\n",
+                      name.c_str());
+      return Status::NotSupported("Delete only supported for archived logs");
+    }
+    status = wal_manager_.DeleteFile(name, number);
+    if (!status.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "DeleteFile %s failed -- %s.\n", name.c_str(),
+                      status.ToString().c_str());
+    }
+    return status;
+  }
+
+  int level;
+  FileMetaData* metadata;
+  ColumnFamilyData* cfd;
+  VersionEdit edit;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "DeleteFile %s failed. File not found\n", name.c_str());
+      job_context.Clean();
+      return Status::InvalidArgument("File not found");
+    }
+    assert(level < cfd->NumberLevels());
+
+    // If the file is being compacted no need to delete.
+    if (metadata->being_compacted) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "DeleteFile %s Skipped. File about to be compacted\n",
+                     name.c_str());
+      job_context.Clean();
+      return Status::OK();
+    }
+
+    // Only the files in the last level can be deleted externally.
+    // This is to make sure that any deletion tombstones are not
+    // lost. Check that the level passed is the last level.
+    auto* vstorage = cfd->current()->storage_info();
+    for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+      if (vstorage->NumLevelFiles(i) != 0) {
+        ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                       "DeleteFile %s FAILED. File not in last level\n",
+                       name.c_str());
+        job_context.Clean();
+        return Status::InvalidArgument("File not in last level");
+      }
+    }
+    // if level == 0, it has to be the oldest file
+    if (level == 0 &&
+        vstorage->LevelFiles(0).back()->fd.GetNumber() != number) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "DeleteFile %s failed ---"
+                     " target file in level 0 must be the oldest.",
+                     name.c_str());
+      job_context.Clean();
+      return Status::InvalidArgument("File in level 0, but not oldest");
+    }
+    edit.SetColumnFamily(cfd->GetID());
+    edit.DeleteFile(level, number);
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(cfd,
+                                         &job_context.superversion_contexts[0],
+                                         *cfd->GetLatestMutableCFOptions());
+    }
+    FindObsoleteFiles(&job_context, false);
+  }  // lock released here
+
+  LogFlush(immutable_db_options_.info_log);
+  // remove files outside the db-lock
+  if (job_context.HaveSomethingToDelete()) {
+    // Call PurgeObsoleteFiles() without holding mutex.
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+  return status;
+}
+
+Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+                                   const RangePtr* ranges, size_t n,
+                                   bool include_end) {
+  Status status;
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  VersionEdit edit;
+  std::set<FileMetaData*> deleted_files;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    Version* input_version = cfd->current();
+
+    auto* vstorage = input_version->storage_info();
+    for (size_t r = 0; r < n; r++) {
+      auto begin = ranges[r].start, end = ranges[r].limit;
+      for (int i = 1; i < cfd->NumberLevels(); i++) {
+        if (vstorage->LevelFiles(i).empty() ||
+            !vstorage->OverlapInLevel(i, begin, end)) {
+          continue;
+        }
+        std::vector<FileMetaData*> level_files;
+        InternalKey begin_storage, end_storage, *begin_key, *end_key;
+        if (begin == nullptr) {
+          begin_key = nullptr;
+        } else {
+          begin_storage.SetMinPossibleForUserKey(*begin);
+          begin_key = &begin_storage;
+        }
+        if (end == nullptr) {
+          end_key = nullptr;
+        } else {
+          end_storage.SetMaxPossibleForUserKey(*end);
+          end_key = &end_storage;
+        }
+
+        vstorage->GetCleanInputsWithinInterval(
+            i, begin_key, end_key, &level_files, -1 /* hint_index */,
+            nullptr /* file_index */);
+        FileMetaData* level_file;
+        for (uint32_t j = 0; j < level_files.size(); j++) {
+          level_file = level_files[j];
+          if (level_file->being_compacted) {
+            continue;
+          }
+          if (deleted_files.find(level_file) != deleted_files.end()) {
+            continue;
+          }
+          if (!include_end && end != nullptr &&
+              cfd->user_comparator()->Compare(level_file->largest.user_key(),
+                                              *end) == 0) {
+            continue;
+          }
+          edit.SetColumnFamily(cfd->GetID());
+          edit.DeleteFile(i, level_file->fd.GetNumber());
+          deleted_files.insert(level_file);
+          level_file->being_compacted = true;
+        }
+      }
+    }
+    if (edit.GetDeletedFiles().empty()) {
+      job_context.Clean();
+      return Status::OK();
+    }
+    input_version->Ref();
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(cfd,
+                                         &job_context.superversion_contexts[0],
+                                         *cfd->GetLatestMutableCFOptions());
+    }
+    for (auto* deleted_file : deleted_files) {
+      deleted_file->being_compacted = false;
+    }
+    input_version->Unref();
+    FindObsoleteFiles(&job_context, false);
+  }  // lock released here
+
+  LogFlush(immutable_db_options_.info_log);
+  // remove files outside the db-lock
+  if (job_context.HaveSomethingToDelete()) {
+    // Call PurgeObsoleteFiles() without holding mutex.
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+  return status;
+}
+
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+  InstrumentedMutexLock l(&mutex_);
+  versions_->GetLiveFilesMetaData(metadata);
+}
+
+void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                                     ColumnFamilyMetaData* cf_meta) {
+  assert(column_family);
+  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  auto* sv = GetAndRefSuperVersion(cfd);
+  {
+    // Without the mutex, Version::GetColumnFamilyMetaData races with
+    // Compaction::MarkFilesBeingCompacted. One solution is to use the mutex,
+    // but this may cause a performance regression. An alternative is to make
+    // FileMetaData::being_compacted atomic, but that would make FileMetaData
+    // non-copyable. Another option is to separate these variables from the
+    // original FileMetaData struct, which would require re-organizing data
+    // structures. For now, we take the easy approach: if
+    // DB::GetColumnFamilyMetaData is not called frequently, the regression
+    // should not be big. We still need to keep an eye on it.
+    InstrumentedMutexLock l(&mutex_);
+    sv->current->GetColumnFamilyMetaData(cf_meta);
+  }
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+#endif  // ROCKSDB_LITE
+
+Status DBImpl::CheckConsistency() {
+  mutex_.AssertHeld();
+  std::vector<LiveFileMetaData> metadata;
+  versions_->GetLiveFilesMetaData(&metadata);
+  TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
+
+  std::string corruption_messages;
+
+  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+    // Instead of calling GetFileSize() for each expected file, call
+    // GetChildren() for the DB directory and check that all expected files
+    // are listed, without checking their sizes.
+    // Since sst files might be in different directories, do it for each
+    // directory separately.
+    std::map<std::string, std::vector<std::string>> files_by_directory;
+    for (const auto& md : metadata) {
+      // md.name has a leading "/". Remove it.
+      std::string fname = md.name;
+      if (!fname.empty() && fname[0] == '/') {
+        fname = fname.substr(1);
+      }
+      files_by_directory[md.db_path].push_back(fname);
+    }
+    for (const auto& dir_files : files_by_directory) {
+      std::string directory = dir_files.first;
+      std::vector<std::string> existing_files;
+      Status s = env_->GetChildren(directory, &existing_files);
+      if (!s.ok()) {
+        corruption_messages +=
+            "Can't list files in " + directory + ": " + s.ToString() + "\n";
+        continue;
+      }
+      std::sort(existing_files.begin(), existing_files.end());
+
+      for (const std::string& fname : dir_files.second) {
+        if (!std::binary_search(existing_files.begin(), existing_files.end(),
+                                fname) &&
+            !std::binary_search(existing_files.begin(), existing_files.end(),
+                                Rocks2LevelTableFileName(fname))) {
+          corruption_messages +=
+              "Missing sst file " + fname + " in " + directory + "\n";
+        }
+      }
+    }
+  } else {
+    for (const auto& md : metadata) {
+      // md.name has a leading "/".
+      std::string file_path = md.db_path + md.name;
+
+      uint64_t fsize = 0;
+      TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
+      Status s = env_->GetFileSize(file_path, &fsize);
+      if (!s.ok() &&
+          env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize)
+              .ok()) {
+        s = Status::OK();
+      }
+      if (!s.ok()) {
+        corruption_messages +=
+            "Can't access " + md.name + ": " + s.ToString() + "\n";
+      } else if (fsize != md.size) {
+        corruption_messages += "Sst file size mismatch: " + file_path +
+                               ". Size recorded in manifest " +
+                               ToString(md.size) + ", actual size " +
+                               ToString(fsize) + "\n";
+      }
+    }
+  }
+
+  if (corruption_messages.size() == 0) {
+    return Status::OK();
+  } else {
+    return Status::Corruption(corruption_messages);
+  }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) const {
+  identity.assign(db_id_);
+  return Status::OK();
+}
+
+Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
+  std::string idfilename = IdentityFileName(dbname_);
+  const FileOptions soptions;
+
+  Status s = ReadFileToString(fs_.get(), idfilename, identity);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // If the last character is '\n', remove it from identity.
+  if (identity->size() > 0 && identity->back() == '\n') {
+    identity->pop_back();
+  }
+  return s;
+}
+
+// Default implementation -- returns not supported status
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+                              const std::string& /*column_family_name*/,
+                              ColumnFamilyHandle** /*handle*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+    const ColumnFamilyOptions& /*cf_options*/,
+    const std::vector<std::string>& /*column_family_names*/,
+    std::vector<ColumnFamilyHandle*>* /*handles*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+    const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+    std::vector<ColumnFamilyHandle*>* /*handles*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamilies(
+    const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
+  delete column_family;
+  return Status::OK();
+}
+
+DB::~DB() {}
+
+Status DBImpl::Close() {
+  if (!closed_) {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      // If there is an unreleased snapshot, fail the close call
+      if (!snapshots_.empty()) {
+        return Status::Aborted("Cannot close DB with unreleased snapshot.");
+      }
+    }
+
+    closed_ = true;
+    return CloseImpl();
+  }
+  return Status::OK();
+}
+
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+                              const std::string& name,
+                              std::vector<std::string>* column_families) {
+  FileSystem* fs = db_options.file_system.get();
+  LegacyFileSystemWrapper legacy_fs(db_options.env);
+  if (!fs) {
+    fs = &legacy_fs;
+  }
+  return VersionSet::ListColumnFamilies(column_families, name, fs);
+}
+
+Snapshot::~Snapshot() {}
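+
+// DestroyDB (below) permanently removes the database: SST files, WAL and
+// archived logs, options files, and finally the directories themselves.
+// Illustrative call (comment only); the DB should not be open, and the
+// operation is not reversible:
+//
+//   Options options;
+//   Status s = DestroyDB("/path/to/db", options);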
+
+Status DestroyDB(const std::string& dbname, const Options& options,
+                 const std::vector<ColumnFamilyDescriptor>& column_families) {
+  ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+  Env* env = soptions.env;
+  std::vector<std::string> filenames;
+  bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions);
+
+  // Reset the logger because it holds a handle to the
+  // log file and prevents cleanup and directory removal
+  soptions.info_log.reset();
+  // Ignore error in case directory does not exist
+  env->GetChildren(dbname, &filenames);
+
+  FileLock* lock;
+  const std::string lockname = LockFileName(dbname);
+  Status result = env->LockFile(lockname, &lock);
+  if (result.ok()) {
+    uint64_t number;
+    FileType type;
+    InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
+    for (const auto& fname : filenames) {
+      if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
+          type != kDBLockFile) {  // Lock file will be deleted at end
+        Status del;
+        std::string path_to_delete = dbname + "/" + fname;
+        if (type == kMetaDatabase) {
+          del = DestroyDB(path_to_delete, options);
+        } else if (type == kTableFile || type == kLogFile) {
+          del = DeleteDBFile(&soptions, path_to_delete, dbname,
+                             /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+        } else {
+          del = env->DeleteFile(path_to_delete);
+        }
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+
+    std::vector<std::string> paths;
+
+    for (const auto& path : options.db_paths) {
+      paths.emplace_back(path.path);
+    }
+    for (const auto& cf : column_families) {
+      for (const auto& path : cf.options.cf_paths) {
+        paths.emplace_back(path.path);
+      }
+    }
+
+    // Remove duplicate paths.
+    // Note that we compare only the actual paths but not path ids.
+    // The reason is that the same path can appear under different path_ids
+    // for different column families.
+    std::sort(paths.begin(), paths.end());
+    paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+    for (const auto& path : paths) {
+      if (env->GetChildren(path, &filenames).ok()) {
+        for (const auto& fname : filenames) {
+          if (ParseFileName(fname, &number, &type) && type == kTableFile) {
+            std::string table_path = path + "/" + fname;
+            Status del = DeleteDBFile(&soptions, table_path, dbname,
+                                      /*force_bg=*/false, /*force_fg=*/false);
+            if (result.ok() && !del.ok()) {
+              result = del;
+            }
+          }
+        }
+        env->DeleteDir(path);
+      }
+    }
+
+    std::vector<std::string> walDirFiles;
+    std::string archivedir = ArchivalDirectory(dbname);
+    bool wal_dir_exists = false;
+    if (dbname != soptions.wal_dir) {
+      wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok();
+      archivedir = ArchivalDirectory(soptions.wal_dir);
+    }
+
+    // The archive dir may be inside the wal dir or dbname and should be
+    // processed and removed before those, otherwise we would have trouble
+    // removing them
+    std::vector<std::string> archiveFiles;
+    if (env->GetChildren(archivedir, &archiveFiles).ok()) {
+      // Delete archival files.
+      for (const auto& file : archiveFiles) {
+        if (ParseFileName(file, &number, &type) && type == kLogFile) {
+          Status del =
+              DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
+                           /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+          if (result.ok() && !del.ok()) {
+            result = del;
+          }
+        }
+      }
+      env->DeleteDir(archivedir);
+    }
+
+    // Delete log files in the WAL dir
+    if (wal_dir_exists) {
+      for (const auto& file : walDirFiles) {
+        if (ParseFileName(file, &number, &type) && type == kLogFile) {
+          Status del =
+              DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
+                           soptions.wal_dir, /*force_bg=*/false,
+                           /*force_fg=*/!wal_in_db_path);
+          if (result.ok() && !del.ok()) {
+            result = del;
+          }
+        }
+      }
+      env->DeleteDir(soptions.wal_dir);
+    }
+
+    env->UnlockFile(lock);  // Ignore error since state is already gone
+    env->DeleteFile(lockname);
+
+    // sst_file_manager holds a ref to the logger. Make sure the logger is
+    // gone before trying to remove the directory.
+    soptions.sst_file_manager.reset();
+
+    env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+  }
+  return result;
+}
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+                                bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+  WriteThread::Writer w;
+  if (need_mutex_lock) {
+    mutex_.Lock();
+  } else {
+    mutex_.AssertHeld();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.EnterUnbatched(&w, &mutex_);
+  }
+
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyOptions> cf_opts;
+
+  // This part requires mutex to protect the column family options
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    cf_names.push_back(cfd->GetName());
+    cf_opts.push_back(cfd->GetLatestCFOptions());
+  }
+
+  // Unlock during expensive operations. New writes cannot get here
+  // because the single write thread ensures all new writes get queued.
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  mutex_.Unlock();
+
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+
+  std::string file_name =
+      TempOptionsFileName(GetName(), versions_->NewFileNumber());
+  Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+                                   GetFileSystem());
+
+  if (s.ok()) {
+    s = RenameTempFileToOptionsFile(file_name);
+  }
+  // restore lock
+  if (!need_mutex_lock) {
+    mutex_.Lock();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.ExitUnbatched(&w);
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Unable to persist options -- %s", s.ToString().c_str());
+    if (immutable_db_options_.fail_if_options_file_error) {
+      return Status::IOError("Unable to persist options.",
+                             s.ToString().c_str());
+    }
+  }
+#else
+  (void)need_mutex_lock;
+  (void)need_enter_write_thread;
+#endif  // !ROCKSDB_LITE
+  return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+                              const size_t num_files_to_keep,
+                              const std::shared_ptr<Logger>& info_log,
+                              Env* env) {
+  if (filenames.size() <= num_files_to_keep) {
+    return;
+  }
+  for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+       iter != filenames.end(); ++iter) {
+    if (!env->DeleteFile(iter->second).ok()) {
+      ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+                     iter->second.c_str());
+    }
+  }
+}
+}  // namespace
+#endif  // !ROCKSDB_LITE
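+
+// DeleteObsoleteOptionsFiles (below) keys its map by
+// (UINT64_MAX - file_number) so that iteration order is newest-first; the
+// first kNumOptionsFilesKept entries survive and the rest are deleted.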
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+  std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+  // to the oldest.
+  std::map<uint64_t, std::string> options_filenames;
+  Status s;
+  s = GetEnv()->GetChildren(GetName(), &filenames);
+  if (!s.ok()) {
+    return s;
+  }
+  for (auto& filename : filenames) {
+    uint64_t file_number;
+    FileType type;
+    if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+      options_filenames.insert(
+          {std::numeric_limits<uint64_t>::max() - file_number,
+           GetName() + "/" + filename});
+    }
+  }
+
+  // Keep the latest 2 options files
+  const size_t kNumOptionsFilesKept = 2;
+  DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+                           immutable_db_options_.info_log, GetEnv());
+  return Status::OK();
+#else
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+  Status s;
+
+  uint64_t options_file_number = versions_->NewFileNumber();
+  std::string options_file_name =
+      OptionsFileName(GetName(), options_file_number);
+  // A fresh file number is used so the new name does not conflict with an
+  // existing options file.
+  s = GetEnv()->RenameFile(file_name, options_file_name);
+  if (s.ok()) {
+    InstrumentedMutexLock l(&mutex_);
+    versions_->options_file_number_ = options_file_number;
+  }
+
+  if (0 == disable_delete_obsolete_files_) {
+    DeleteObsoleteOptionsFiles();
+  }
+  return s;
+#else
+  (void)file_name;
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+  if (immutable_db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
+                                          cfd->ioptions()->env);
+  }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+  if (immutable_db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+  }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+  if (immutable_db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::EraseDatabaseInfo(this);
+  }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusDbInfo() const {}
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+// A global method that can dump out the build version
+void DumpRocksDBBuildVersion(Logger* log) {
+#if !defined(IOS_CROSS_COMPILE)
+  // if we compile with Xcode, we don't run build_detect_version, so we don't
+  // generate util/build_version.cc
+  ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR, ROCKSDB_PATCH);
+  ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha);
+  ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date);
+#else
+  (void)log;  // ignore "-Wunused-parameter"
+#endif
+}
+
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                         bool include_history) {
+  // Find the earliest sequence number that we know we can rely on reading
+  // from the memtable without needing to check sst files.
+  SequenceNumber earliest_seq =
+      sv->imm->GetEarliestSequenceNumber(include_history);
+  if (earliest_seq == kMaxSequenceNumber) {
+    earliest_seq = sv->mem->GetEarliestSequenceNumber();
+  }
+  assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+  return earliest_seq;
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+                                       bool cache_only,
+                                       SequenceNumber lower_bound_seq,
+                                       SequenceNumber* seq,
+                                       bool* found_record_for_key,
+                                       bool* is_blob_index) {
+  Status s;
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+
+  ReadOptions read_options;
+  SequenceNumber current_seq = versions_->LastSequence();
+  LookupKey lkey(key, current_seq);
+
+  *seq = kMaxSequenceNumber;
+  *found_record_for_key = false;
+
+  // Check if there is a record for this key in the latest memtable
+  sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+               seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Unexpected status returned from MemTable::Get: %s\n",
+                    s.ToString().c_str());
+
+    return s;
+  }
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check immutable memtables
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+  if (lower_bound_in_mem != kMaxSequenceNumber &&
+      lower_bound_in_mem < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the immutable memtables
+  sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+               seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Unexpected status returned from MemTableList::Get: %s\n",
+                    s.ToString().c_str());
+
+    return s;
+  }
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check memtable history
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+  if (lower_bound_in_imm != kMaxSequenceNumber &&
+      lower_bound_in_imm < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the memtable history
+  sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, seq, read_options,
+                          is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(
+        immutable_db_options_.info_log,
+        "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+        s.ToString().c_str());
+
+    return s;
+  }
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check SST files
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+  // check here to skip the history if possible. But currently the caller
+  // already does that. Maybe we should move the logic here later.
+
+  // TODO(agiardullo): possible optimization: consider checking cached
+  // SST files if cache_only=true?
+  if (!cache_only) {
+    // Check tables
+    sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
+                     &max_covering_tombstone_seq, nullptr /* value_found */,
+                     found_record_for_key, seq, nullptr /*read_callback*/,
+                     is_blob_index);
+
+    if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+      // unexpected error reading SST files
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Unexpected status returned from Version::Get: %s\n",
+                      s.ToString().c_str());
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::IngestExternalFile(
+    ColumnFamilyHandle* column_family,
+    const std::vector<std::string>& external_files,
+    const IngestExternalFileOptions& ingestion_options) {
+  IngestExternalFileArg arg;
+  arg.column_family = column_family;
+  arg.external_files = external_files;
+  arg.options = ingestion_options;
+  return IngestExternalFiles({arg});
+}
+
+Status DBImpl::IngestExternalFiles(
+    const std::vector<IngestExternalFileArg>& args) {
+  if (args.empty()) {
+    return Status::InvalidArgument("ingestion arg list is empty");
+  }
+  {
+    std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+    for (const auto& arg : args) {
+      if (arg.column_family == nullptr) {
+        return Status::InvalidArgument("column family handle is null");
+      } else if (unique_cfhs.count(arg.column_family) > 0) {
+        return Status::InvalidArgument(
+            "ingestion args have duplicate column families");
+      }
+      unique_cfhs.insert(arg.column_family);
+    }
+  }
+  // Ingest multiple external SST files atomically.
+  size_t num_cfs = args.size();
+  for (size_t i = 0; i != num_cfs; ++i) {
+    if (args[i].external_files.empty()) {
+      char err_msg[128] = {0};
+      snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+      return Status::InvalidArgument(err_msg);
+    }
+  }
+  for (const auto& arg : args) {
+    const IngestExternalFileOptions& ingest_opts = arg.options;
+    if (ingest_opts.ingest_behind &&
+        !immutable_db_options_.allow_ingest_behind) {
+      return Status::InvalidArgument(
+          "can't ingest_behind file in DB with allow_ingest_behind=false");
+    }
+  }
+
+  // TODO (yanqin) maybe handle the case in which column_families have
+  // duplicates
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+  size_t total = 0;
+  for (const auto& arg : args) {
+    total += arg.external_files.size();
+  }
+  uint64_t next_file_number = 0;
+  Status status = ReserveFileNumbersBeforeIngestion(
+      static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(),
+      total, pending_output_elem, &next_file_number);
+  if (!status.ok()) {
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    return status;
+  }
+
+  std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+  for (const auto& arg : args) {
+    auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+    ingestion_jobs.emplace_back(
+        env_, versions_.get(), cfd, immutable_db_options_, file_options_,
+        &snapshots_, arg.options, &directories_, &event_logger_);
+  }
+  std::vector<std::pair<bool, Status>> exec_results;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    exec_results.emplace_back(false, Status::OK());
+  }
+  // TODO(yanqin) maybe make jobs run in parallel
+  uint64_t start_file_number = next_file_number;
+  for (size_t i = 1; i != num_cfs; ++i) {
+    start_file_number += args[i - 1].external_files.size();
+    auto* cfd =
+        static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    exec_results[i].second = ingestion_jobs[i].Prepare(
+        args[i].external_files, start_file_number, super_version);
+    exec_results[i].first = true;
+    CleanupSuperVersion(super_version);
+  }
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+  {
+    auto* cfd =
+        static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    exec_results[0].second = ingestion_jobs[0].Prepare(
+        args[0].external_files, next_file_number, super_version);
+    exec_results[0].first = true;
+    CleanupSuperVersion(super_version);
+  }
+  for (const auto& exec_result : exec_results) {
+    if (!exec_result.second.ok()) {
+      status = exec_result.second;
+      break;
+    }
+  }
+  if (!status.ok()) {
+    for (size_t i = 0; i != num_cfs; ++i) {
+      if (exec_results[i].first) {
+        ingestion_jobs[i].Cleanup(status);
+      }
+    }
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    return status;
+  }
+
+  std::vector<SuperVersionContext> sv_ctxs;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    sv_ctxs.emplace_back(true /* create_superversion */);
+  }
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+  TEST_SYNC_POINT("DBImpl::AddFile:Start");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+    // Stop writes to the DB by entering both write threads
+    WriteThread::Writer w;
+    write_thread_.EnterUnbatched(&w, &mutex_);
+    WriteThread::Writer nonmem_w;
+    if (two_write_queues_) {
+      nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+    }
+
+    // When unordered_write is enabled, keys are written to the memtable in an
+    // unordered way. If the ingestion job checked the memtable key range
+    // before a pending key landed in the memtable, it could skip a necessary
+    // memtable flush. So wait here to ensure there is no pending write to the
+    // memtable.
+    WaitForPendingWrites();
+
+    num_running_ingest_file_ += static_cast<int>(num_cfs);
+    TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+    bool at_least_one_cf_need_flush = false;
+    std::vector<bool> need_flush(num_cfs, false);
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto* cfd =
+          static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+      if (cfd->IsDropped()) {
+        // TODO (yanqin) investigate whether we should abort ingestion or
+        // proceed with other non-dropped column families.
+        status = Status::InvalidArgument(
+            "cannot ingest an external file into a dropped CF");
+        break;
+      }
+      bool tmp = false;
+      status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+      need_flush[i] = tmp;
+      at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+      if (!status.ok()) {
+        break;
+      }
+    }
+    TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+                             &at_least_one_cf_need_flush);
+
+    if (status.ok() && at_least_one_cf_need_flush) {
+      FlushOptions flush_opts;
+      flush_opts.allow_write_stall = true;
+      if (immutable_db_options_.atomic_flush) {
+        autovector<ColumnFamilyData*> cfds_to_flush;
+        SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+        mutex_.Unlock();
+        status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+                                      FlushReason::kExternalFileIngestion,
+                                      true /* writes_stopped */);
+        mutex_.Lock();
+      } else {
+        for (size_t i = 0; i != num_cfs; ++i) {
+          if (need_flush[i]) {
+            mutex_.Unlock();
+            auto* cfd =
+                static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+                    ->cfd();
+            status = FlushMemTable(cfd, flush_opts,
+                                   FlushReason::kExternalFileIngestion,
+                                   true /* writes_stopped */);
+            mutex_.Lock();
+            if (!status.ok()) {
+              break;
+            }
+          }
+        }
+      }
+    }
+    // Run ingestion jobs.
+    if (status.ok()) {
+      for (size_t i = 0; i != num_cfs; ++i) {
+        status = ingestion_jobs[i].Run();
+        if (!status.ok()) {
+          break;
+        }
+      }
+    }
+    if (status.ok()) {
+      int consumed_seqno_count =
+          ingestion_jobs[0].ConsumedSequenceNumbersCount();
+#ifndef NDEBUG
+      for (size_t i = 1; i != num_cfs; ++i) {
+        assert(!!consumed_seqno_count ==
+               !!ingestion_jobs[i].ConsumedSequenceNumbersCount());
+        consumed_seqno_count +=
+            ingestion_jobs[i].ConsumedSequenceNumbersCount();
+      }
+#endif
+      if (consumed_seqno_count > 0) {
+        const SequenceNumber last_seqno = versions_->LastSequence();
+        versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+        versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+        versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+      }
+      autovector<ColumnFamilyData*> cfds_to_commit;
+      autovector<const MutableCFOptions*> mutable_cf_options_list;
+      autovector<autovector<VersionEdit*>> edit_lists;
+      uint32_t num_entries = 0;
+      for (size_t i = 0; i != num_cfs; ++i) {
+        auto* cfd =
+            static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+        if (cfd->IsDropped()) {
+          continue;
+        }
+        cfds_to_commit.push_back(cfd);
+        mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+        autovector<VersionEdit*> edit_list;
+        edit_list.push_back(ingestion_jobs[i].edit());
+        edit_lists.push_back(edit_list);
+        ++num_entries;
+      }
+      // Mark the version edits as an atomic group if the number of version
+      // edits exceeds 1.
+      if (cfds_to_commit.size() > 1) {
+        for (auto& edits : edit_lists) {
+          assert(edits.size() == 1);
+          edits[0]->MarkAtomicGroup(--num_entries);
+        }
+        assert(0 == num_entries);
+      }
+      status =
+          versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+                                 edit_lists, &mutex_, directories_.GetDbDir());
+    }
+
+    if (status.ok()) {
+      for (size_t i = 0; i != num_cfs; ++i) {
+        auto* cfd =
+            static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+        if (!cfd->IsDropped()) {
+          InstallSuperVersionAndScheduleWork(
+              cfd, &sv_ctxs[i], *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+          if (0 == i && num_cfs > 1) {
+            TEST_SYNC_POINT(
+                "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+            TEST_SYNC_POINT(
+                "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+          }
+#endif  // !NDEBUG
+        }
+      }
+    }
+
+    // Resume writes to the DB
+    if (two_write_queues_) {
+      nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+    }
+    write_thread_.ExitUnbatched(&w);
+
+    if (status.ok()) {
+      for (auto& job : ingestion_jobs) {
+        job.UpdateStats();
+      }
+    }
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    num_running_ingest_file_ -= static_cast<int>(num_cfs);
+    if (0 == num_running_ingest_file_) {
+      bg_cv_.SignalAll();
+    }
+    TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+  }
+  // mutex_ is unlocked here
+
+  // Cleanup
+  for (size_t i = 0; i != num_cfs; ++i) {
+    sv_ctxs[i].Clean();
+    // This may rollback jobs that have completed successfully. This is
+    // intended for atomicity.
+    ingestion_jobs[i].Cleanup(status);
+  }
+  if (status.ok()) {
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto* cfd =
+          static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+      if (!cfd->IsDropped()) {
+        NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+      }
+    }
+  }
+  return status;
+}
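+
+// Editorial sketch (not part of the original source): a typical client-side
+// ingestion flow pairs SstFileWriter with the IngestExternalFile() entry
+// point implemented above; the file path is hypothetical:
+//
+//   Options options;
+//   SstFileWriter writer(EnvOptions(), options);
+//   Status s = writer.Open("/tmp/example.sst");
+//   if (s.ok()) s = writer.Put("key1", "value1");  // keys must be added in
+//   if (s.ok()) s = writer.Put("key2", "value2");  // sorted order
+//   if (s.ok()) s = writer.Finish();
+//   if (s.ok()) {
+//     s = db->IngestExternalFile({"/tmp/example.sst"},
+//                                IngestExternalFileOptions());
+//   }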
+
+Status DBImpl::CreateColumnFamilyWithImport(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    const ImportColumnFamilyOptions& import_options,
+    const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  assert(*handle == nullptr);
+  std::string cf_comparator_name = options.comparator->Name();
+  if (cf_comparator_name != metadata.db_comparator_name) {
+    return Status::InvalidArgument("Comparator name mismatch");
+  }
+
+  // Create column family.
+  auto status = CreateColumnFamily(options, column_family_name, handle);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Import sst files from metadata.
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(*handle);
+  auto cfd = cfh->cfd();
+  ImportColumnFamilyJob import_job(env_, versions_.get(), cfd,
+                                   immutable_db_options_, file_options_,
+                                   import_options, metadata.files);
+
+  SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+  VersionEdit dummy_edit;
+  uint64_t next_file_number = 0;
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+  {
+    // Lock db mutex
+    InstrumentedMutexLock l(&mutex_);
+    if (error_handler_.IsDBStopped()) {
+      // Don't import files when there is a bg_error
+      status = error_handler_.GetBGError();
+    }
+
+    // Make sure that bg cleanup won't delete the files that we are importing
+    pending_output_elem.reset(new std::list<uint64_t>::iterator(
+        CaptureCurrentFileNumberInPendingOutputs()));
+
+    if (status.ok()) {
+      // If a crash happens after a hard link is established, the Recover
+      // function may reuse a file number that has already been assigned to
+      // the internal file, which would overwrite the external file. To
+      // protect the external file, we have to make sure the file number will
+      // never be reused.
+      next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
+      auto cf_options = cfd->GetLatestMutableCFOptions();
+      status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+                                      directories_.GetDbDir());
+      if (status.ok()) {
+        InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+      }
+    }
+  }
+  dummy_sv_ctx.Clean();
+
+  if (status.ok()) {
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+    status = import_job.Prepare(next_file_number, sv);
+    CleanupSuperVersion(sv);
+  }
+
+  if (status.ok()) {
+    SuperVersionContext sv_context(true /*create_superversion*/);
+    {
+      // Lock db mutex
+      InstrumentedMutexLock l(&mutex_);
+
+      // Stop writes to the DB by entering both write threads
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      WriteThread::Writer nonmem_w;
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+
+      num_running_ingest_file_++;
+      assert(!cfd->IsDropped());
+      status = import_job.Run();
+
+      // Install job edit [Mutex will be unlocked here]
+      if (status.ok()) {
+        auto cf_options = cfd->GetLatestMutableCFOptions();
+        status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+                                        &mutex_, directories_.GetDbDir());
+        if (status.ok()) {
+          InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+        }
+      }
+
+      // Resume writes to the DB
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+      write_thread_.ExitUnbatched(&w);
+
+      num_running_ingest_file_--;
+      if (num_running_ingest_file_ == 0) {
+        bg_cv_.SignalAll();
+      }
+    }
+    // mutex_ is unlocked here
+
+    sv_context.Clean();
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+  }
+
+  import_job.Cleanup(status);
+  if (!status.ok()) {
+    DropColumnFamily(*handle);
+    DestroyColumnFamilyHandle(*handle);
+    *handle = nullptr;
+  }
+  return status;
+}
+
+Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
+  Status s;
+  std::vector<ColumnFamilyData*> cfd_list;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->IsDropped() && cfd->initialized()) {
+        cfd->Ref();
+        cfd_list.push_back(cfd);
+      }
+    }
+  }
+  std::vector<SuperVersion*> sv_list;
+  for (auto cfd : cfd_list) {
+    sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+  }
+  for (auto& sv : sv_list) {
+    VersionStorageInfo* vstorage = sv->current->storage_info();
+    ColumnFamilyData* cfd = sv->current->cfd();
+    Options opts;
+    {
+      InstrumentedMutexLock l(&mutex_);
+      opts = Options(BuildDBOptions(immutable_db_options_,
+                                    mutable_db_options_),
+                     cfd->GetLatestCFOptions());
+    }
+    for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+      for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+           j++) {
+        const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
+        std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+                                          fd.GetNumber(), fd.GetPathId());
+        s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_,
+                                                     read_options, fname);
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+  }
+  bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto sv : sv_list) {
+      if (sv && sv->Unref()) {
+        sv->Cleanup();
+        if (defer_purge) {
+          AddSuperVersionsToFreeQueue(sv);
+        } else {
+          delete sv;
+        }
+      }
+    }
+    if (defer_purge) {
+      SchedulePurge();
+    }
+    for (auto cfd : cfd_list) {
+      cfd->UnrefAndTryDelete();
+    }
+  }
+  return s;
+}
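+
+// Editorial sketch (not part of the original source): the notification
+// emitted by NotifyOnExternalFileIngested() below can be observed by
+// registering an EventListener in DBOptions::listeners; the class name is
+// hypothetical:
+//
+//   class IngestLogger : public EventListener {
+//    public:
+//     void OnExternalFileIngested(
+//         DB* /*db*/, const ExternalFileIngestionInfo& info) override {
+//       // e.g. record info.external_file_path, info.internal_file_path and
+//       // the assigned global sequence number info.global_seqno.
+//     }
+//   };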
+
+void DBImpl::NotifyOnExternalFileIngested(
+    ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+  if (immutable_db_options_.listeners.empty()) {
+    return;
+  }
+
+  for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+    ExternalFileIngestionInfo info;
+    info.cf_name = cfd->GetName();
+    info.external_file_path = f.external_file_path;
+    info.internal_file_path = f.internal_file_path;
+    info.global_seqno = f.assigned_seqno;
+    info.table_properties = f.table_properties;
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnExternalFileIngested(this, info);
+    }
+  }
+}
+
+void DBImpl::WaitForIngestFile() {
+  mutex_.AssertHeld();
+  while (num_running_ingest_file_ > 0) {
+    bg_cv_.Wait();
+  }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+                          std::unique_ptr<TraceWriter>&& trace_writer) {
+  InstrumentedMutexLock lock(&trace_mutex_);
+  tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer)));
+  return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+  InstrumentedMutexLock lock(&trace_mutex_);
+  Status s;
+  if (tracer_ != nullptr) {
+    s = tracer_->Close();
+    tracer_.reset();
+  } else {
+    return Status::IOError("No trace file to close");
+  }
+  return s;
+}
+
+Status DBImpl::StartBlockCacheTrace(
+    const TraceOptions& trace_options,
+    std::unique_ptr<TraceWriter>&& trace_writer) {
+  return block_cache_tracer_.StartTrace(env_, trace_options,
+                                        std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+  block_cache_tracer_.EndTrace();
+  return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) {
+  Status s;
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      s = tracer_->IteratorSeek(cf_id, key);
+    }
+  }
+  return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id,
+                                        const Slice& key) {
+  Status s;
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      s = tracer_->IteratorSeekForPrev(cf_id, key);
+    }
+  }
+  return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+    ColumnFamilyData* cfd, uint64_t num,
+    std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+    uint64_t* next_file_number) {
+  Status s;
+  SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+  assert(nullptr != next_file_number);
+  InstrumentedMutexLock l(&mutex_);
+  if (error_handler_.IsDBStopped()) {
+    // Do not ingest files when there is a bg_error
+    return error_handler_.GetBGError();
+  }
+  pending_output_elem.reset(new std::list<uint64_t>::iterator(
+      CaptureCurrentFileNumberInPendingOutputs()));
+  *next_file_number =
+      versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+  auto cf_options = cfd->GetLatestMutableCFOptions();
+  VersionEdit dummy_edit;
+  // If a crash happens after a hard link is established, the Recover function
+  // may reuse a file number that has already been assigned to the internal
+  // file, which would overwrite the external file. To protect the external
+  // file, we have to make sure the file number will never be reused.
+  s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+                             directories_.GetDbDir());
+  if (s.ok()) {
+    InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+  }
+  dummy_sv_ctx.Clean();
+  return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+  if (mutable_db_options_.max_open_files == -1) {
+    uint64_t oldest_time = port::kMaxUint64;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->IsDropped()) {
+        uint64_t ctime;
+        {
+          SuperVersion* sv = GetAndRefSuperVersion(cfd);
+          Version* version = sv->current;
+          version->GetCreationTimeOfOldestFile(&ctime);
+          ReturnAndCleanupSuperVersion(cfd, sv);
+        }
+
+        if (ctime < oldest_time) {
+          oldest_time = ctime;
+        }
+        if (oldest_time == 0) {
+          break;
+        }
+      }
+    }
+    *creation_time = oldest_time;
+    return Status::OK();
+  } else {
+    return Status::NotSupported("This API only works if max_open_files = -1");
+  }
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..119555cb4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "trace_replay/trace_replay.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than main one.
+class Directories {
+ public:
+  Status SetDirectories(Env* env, const std::string& dbname,
+                        const std::string& wal_dir,
+                        const std::vector<DbPath>& data_paths);
+
+  Directory* GetDataDir(size_t path_id) const {
+    assert(path_id < data_dirs_.size());
+    Directory* ret_dir = data_dirs_[path_id].get();
+    if (ret_dir == nullptr) {
+      // Should use db_dir_
+      return db_dir_.get();
+    }
+    return ret_dir;
+  }
+
+  Directory* GetWalDir() {
+    if (wal_dir_) {
+      return wal_dir_.get();
+    }
+    return db_dir_.get();
+  }
+
+  Directory* GetDbDir() { return db_dir_.get(); }
+
+ private:
+  std::unique_ptr<Directory> db_dir_;
+  std::vector<std::unique_ptr<Directory>> data_dirs_;
+  std::unique_ptr<Directory> wal_dir_;
+};
+
+// While DB is the public interface of RocksDB, DBImpl is the actual class
+// implementing it. It is the entry point of the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definitions of the functions are
+// divided among several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+  DBImpl(const DBOptions& options, const std::string& dbname,
+         const bool seq_per_batch = false, const bool batch_per_txn = true);
+  // No copying allowed
+  DBImpl(const DBImpl&) = delete;
+  void operator=(const DBImpl&) = delete;
+
+  virtual ~DBImpl();
+
+  // ---- Implementations of the DB interface ----
+
+  using DB::Resume;
+  virtual Status Resume() override;
+
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) override;
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override;
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override;
+  using DB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override;
+  using DB::Write;
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* updates) override;
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using DB::GetMergeOperands;
+  Status GetMergeOperands(const ReadOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          PinnableSlice* merge_operands,
+                          GetMergeOperandsOptions* get_merge_operands_options,
+                          int* number_of_operands) override {
+    GetImplOptions get_impl_options;
+    get_impl_options.column_family = column_family;
+    get_impl_options.merge_operands = merge_operands;
+    get_impl_options.get_merge_operands_options = get_merge_operands_options;
+    get_impl_options.number_of_operands = number_of_operands;
+    get_impl_options.get_value = false;
+    return GetImpl(options, key, get_impl_options);
+  }
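+
+  // Editorial sketch (not part of the original source): a hedged example of
+  // calling GetMergeOperands(); the bound and key are hypothetical:
+  //
+  //   const int kMaxOperands = 8;
+  //   PinnableSlice operands[kMaxOperands];
+  //   GetMergeOperandsOptions gmo_opts;
+  //   gmo_opts.expected_max_number_of_operands = kMaxOperands;
+  //   int num_operands = 0;
+  //   Status s = db->GetMergeOperands(ReadOptions(),
+  //                                   db->DefaultColumnFamily(), "key",
+  //                                   operands, &gmo_opts, &num_operands);
+  //   // On success, operands[0..num_operands) hold the unmerged operands.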
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  // This MultiGet is a batched version, which may be faster than calling Get
+  // multiple times, especially if the keys have some spatial locality that
+  // enables them to be queried in the same SST files/set of files. The larger
+  // the batch size, the more scope for batching and performance improvement.
+  // The values and statuses parameters are arrays with number of elements
+  // equal to keys.size(). This allows the storage for those to be allocated
+  // by the caller on the stack for small batches.
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families,
+                        const Slice* keys, PinnableSlice* values,
+                        Status* statuses,
+                        const bool sorted_input = false) override;
+
+  virtual void MultiGetWithCallback(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      ReadCallback* callback,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                    const std::string& column_family,
+                                    ColumnFamilyHandle** handle) override;
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& cf_options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
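+
+  // Editorial sketch (not part of the original source): the batched MultiGet
+  // above lets small batches keep values/statuses on the caller's stack; the
+  // batch size and keys are hypothetical:
+  //
+  //   constexpr size_t kNumKeys = 4;
+  //   Slice keys[kNumKeys] = {"k0", "k1", "k2", "k3"};
+  //   PinnableSlice values[kNumKeys];
+  //   Status statuses[kNumKeys];
+  //   db->MultiGet(ReadOptions(), db->DefaultColumnFamily(), kNumKeys, keys,
+  //                values, statuses, /*sorted_input=*/false);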
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found
+  // in memory. On return, if the value was found, then value_found will be
+  // set to true, otherwise false.
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family,
+                           const Slice& key, std::string* value,
+                           bool* value_found = nullptr) override;
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override;
+
+  virtual const Snapshot* GetSnapshot() override;
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property,
+                           std::string* value) override;
+  using DB::GetMapProperty;
+  virtual bool GetMapProperty(
+      ColumnFamilyHandle* column_family, const Slice& property,
+      std::map<std::string, std::string>* value) override;
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property,
+                              uint64_t* value) override;
+  using DB::GetAggregatedIntProperty;
+  virtual bool GetAggregatedIntProperty(const Slice& property,
+                                        uint64_t* aggregated_value) override;
+  using DB::GetApproximateSizes;
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* range, int n,
+                                     uint64_t* sizes) override;
+  using DB::GetApproximateMemTableStats;
+  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                           const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) override;
+  using DB::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override;
+
+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override;
+
+  virtual Status PauseBackgroundWork() override;
+  virtual Status ContinueBackgroundWork() override;
+
+  virtual Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+  virtual void EnableManualCompaction() override;
+  virtual void DisableManualCompaction() override;
+
+  using DB::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& options_map)
+      override;
+
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& options_map)
+      override;
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(
+      ColumnFamilyHandle* column_family) override;
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(
+      ColumnFamilyHandle* column_family) override;
+  virtual const std::string& GetName() const override;
+  virtual Env* GetEnv() const override;
+  virtual FileSystem* GetFileSystem() const override;
+  using DB::GetOptions;
+  virtual Options GetOptions(
+      ColumnFamilyHandle* column_family) const override;
+  using DB::GetDBOptions;
+  virtual DBOptions GetDBOptions() const override;
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) override;
+  virtual Status Flush(
+      const FlushOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+  virtual Status FlushWAL(bool sync) override;
+
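+  // Editorial sketch (not part of the original source): a hedged example of
+  // forcing a manual flush of the memtable and of the WAL buffer via the
+  // declarations above:
+  //
+  //   FlushOptions fo;
+  //   fo.wait = true;  // block until the flush finishes
+  //   Status s = db->Flush(fo, db->DefaultColumnFamily());
+  //   if (s.ok()) s = db->FlushWAL(/*sync=*/true);
+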
+  bool TEST_WALBufferIsEmpty(bool lock = true);
+  virtual Status SyncWAL() override;
+  virtual Status LockWAL() override;
+  virtual Status UnlockWAL() override;
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override;
+
+  virtual bool SetPreserveDeletesSequenceNumber(
+      SequenceNumber seqnum) override;
+
+  virtual Status GetDbIdentity(std::string& identity) const override;
+
+  virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+  ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+  ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+  virtual Status Close() override;
+
+  Status GetStatsHistory(
+      uint64_t start_time, uint64_t end_time,
+      std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+  using DB::ResetStats;
+  virtual Status ResetStats() override;
+  virtual Status DisableFileDeletions() override;
+  virtual Status EnableFileDeletions(bool force) override;
+  virtual int IsFileDeletionsEnabled() const;
+  // All the returned filenames start with "/"
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) override;
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+  virtual Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* current_log_file) override;
+  virtual Status GetCreationTimeOfOldestFile(
+      uint64_t* creation_time) override;
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number,
+      std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options =
+          TransactionLogIterator::ReadOptions()) override;
+  virtual Status DeleteFile(std::string name) override;
+  Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+                             const RangePtr* ranges, size_t n,
+                             bool include_end = true);
+
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* metadata) override;
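+
+  // Editorial sketch (not part of the original source): tailing the WAL via
+  // GetUpdatesSince() declared above; the starting sequence is hypothetical:
+  //
+  //   std::unique_ptr<TransactionLogIterator> wal_iter;
+  //   Status s = db->GetUpdatesSince(/*seq_number=*/100, &wal_iter);
+  //   while (s.ok() && wal_iter->Valid()) {
+  //     BatchResult batch = wal_iter->GetBatch();
+  //     // batch.sequence is the first seqno in *batch.writeBatchPtr.
+  //     wal_iter->Next();
+  //   }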
+
+  // Obtains the meta data of the specified column family of the DB.
+  // Status::NotFound() will be returned if the current DB does not have
+  // any column family matching the specified name.
+  // TODO(yhchiang): output parameter is placed at the end in this codebase.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) override;
+
+  Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+                             const Slice* begin, const Slice* end) override;
+
+  Status PromoteL0(ColumnFamilyHandle* column_family,
+                   int target_level) override;
+
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& ingestion_options) override;
+
+  using DB::IngestExternalFiles;
+  virtual Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& args) override;
+
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options,
+      const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) override;
+
+  using DB::VerifyChecksum;
+  virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+
+  using DB::StartTrace;
+  virtual Status StartTrace(
+      const TraceOptions& options,
+      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  using DB::EndTrace;
+  virtual Status EndTrace() override;
+
+  using DB::StartBlockCacheTrace;
+  Status StartBlockCacheTrace(
+      const TraceOptions& options,
+      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  using DB::EndBlockCacheTrace;
+  Status EndBlockCacheTrace() override;
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override;
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) override;
+
+#endif  // ROCKSDB_LITE
+
+  // ---- End of implementations of the DB interface ----
+
+  struct GetImplOptions {
+    ColumnFamilyHandle* column_family = nullptr;
+    PinnableSlice* value = nullptr;
+    bool* value_found = nullptr;
+    ReadCallback* callback = nullptr;
+    bool* is_blob_index = nullptr;
+    // If true, return the value associated with key via the value pointer;
+    // else return all merge operands for key via the merge_operands pointer.
+    bool get_value = true;
+    // Pointer to an array of size
+    // get_merge_operands_options.expected_max_number_of_operands allocated
+    // by the user.
+    PinnableSlice* merge_operands = nullptr;
+    GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+    int* number_of_operands = nullptr;
+  };
+
+  // Function that Get and KeyMayExist call with no_io true or false.
+  // Note: 'value_found' from KeyMayExist propagates here.
+  // This function is also called by GetMergeOperands.
+  // If get_impl_options.get_value = true, get the value associated with
+  // get_impl_options.key via get_impl_options.value;
+  // if get_impl_options.get_value = false, get the merge operands associated
+  // with get_impl_options.key via get_impl_options.merge_operands.
+  Status GetImpl(const ReadOptions& options, const Slice& key,
+                 GetImplOptions get_impl_options);
+
+  ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+                                      ColumnFamilyData* cfd,
+                                      SequenceNumber snapshot,
+                                      ReadCallback* read_callback,
+                                      bool allow_blob = false,
+                                      bool allow_refresh = true);
+
+  virtual SequenceNumber GetLastPublishedSequence() const {
+    if (last_seq_same_as_publish_seq_) {
+      return versions_->LastSequence();
+    } else {
+      return versions_->LastPublishedSequence();
+    }
+  }
+
+  // REQUIRES: joined the main write queue if two_write_queues is disabled,
+  // and the second write queue otherwise.
+  virtual void SetLastPublishedSequence(SequenceNumber seq);
+  // Returns LastSequence in last_seq_same_as_publish_seq_
+  // mode and LastAllocatedSequence otherwise. This is useful when visibility
+  // depends also on data written to the WAL but not to the memtable.
+  SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+  // Similar to Write() but will call the callback once on the single write
+  // thread to determine whether it is safe to perform the write.
+  virtual Status WriteWithCallback(const WriteOptions& write_options,
+                                   WriteBatch* my_batch,
+                                   WriteCallback* callback);
+
+  // Returns the sequence number that is guaranteed to be smaller than or
+  // equal to the sequence number of any key that could be inserted into the
+  // current memtables. It can then be assumed that any write with a larger
+  // (or equal) sequence number will be present in this memtable or a later
+  // memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  //
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                   bool include_history);
+
+  // For a given key, check to see if there are any records for this key
+  // in the memtables, including memtable history. If cache_only is false,
+  // SST files will also be checked.
+  //
+  // If a key is found, *found_record_for_key will be set to true and
+  // *seq will be set to the stored sequence number for the latest
+  // operation on this key or kMaxSequenceNumber if unknown.
+  // If no key is found, *found_record_for_key will be set to false.
+  //
+  // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+  // the sequence number has been cleared from the record. If the caller is
+  // holding an active db snapshot, we know the missing sequence must be less
+  // than the snapshot's sequence number (sequence numbers are only cleared
+  // when there are no earlier active snapshots).
+  //
+  // If NotFound is returned and found_record_for_key is set to false, then
+  // no record for this key was found. If the caller is holding an active db
+  // snapshot, we know that no key could have existed after this snapshot
+  // (since we do not compact keys that have an earlier snapshot).
+  //
+  // Only records newer than or at `lower_bound_seq` are guaranteed to be
+  // returned. Memtables and files may not be checked if they only contain
+  // data older than `lower_bound_seq`.
+  //
+  // Returns OK or NotFound on success,
+  // other status on unexpected error.
+  // TODO(andrewkr): this API needs to be aware of range deletion operations
+  Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+                                 bool cache_only,
+                                 SequenceNumber lower_bound_seq,
+                                 SequenceNumber* seq,
+                                 bool* found_record_for_key,
+                                 bool* is_blob_index = nullptr);
+
+  Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key);
+  Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key);
+#endif  // ROCKSDB_LITE
+
+  // Similar to GetSnapshot(), but also lets the db know that this snapshot
+  // will be used for transaction write-conflict checking. The DB can then
+  // make sure not to compact any keys that would prevent a write-conflict
+  // from being detected.
+  const Snapshot* GetSnapshotForWriteConflictBoundary();
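+
+  // Editorial sketch (not part of the original source): how a transaction
+  // layer might combine the two calls above for write-conflict detection;
+  // all names are hypothetical:
+  //
+  //   const Snapshot* snap = db_impl->GetSnapshotForWriteConflictBoundary();
+  //   // ... buffer the write ...
+  //   SequenceNumber seq = kMaxSequenceNumber;
+  //   bool found = false;
+  //   Status s = db_impl->GetLatestSequenceForKey(
+  //       sv, key, /*cache_only=*/true, snap->GetSequenceNumber(), &seq,
+  //       &found);
+  //   // A conflict exists if a newer record was found, i.e.
+  //   // found && seq > snap->GetSequenceNumber().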
+
+  // Checks that all live files exist in the file system and that their file
+  // sizes match our in-memory records.
+  virtual Status CheckConsistency();
+
+  // max_file_num_to_ignore allows bottom level compaction to filter out
+  // newly compacted SST files. Setting max_file_num_to_ignore to kMaxUint64
+  // will disable the filtering.
+  Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+                             int output_level,
+                             const CompactRangeOptions& compact_range_options,
+                             const Slice* begin, const Slice* end,
+                             bool exclusive, bool disallow_trivial_move,
+                             uint64_t max_file_num_to_ignore);
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  InternalIterator* NewInternalIterator(
+      Arena* arena, RangeDelAggregator* range_del_agg,
+      SequenceNumber sequence, ColumnFamilyHandle* column_family = nullptr);
+
+  LogsWithPrepTracker* logs_with_prep_tracker() {
+    return &logs_with_prep_tracker_;
+  }
+
+  struct BGJobLimits {
+    int max_flushes;
+    int max_compactions;
+  };
+  // Returns maximum background flushes and compactions allowed to be
+  // scheduled.
+  BGJobLimits GetBGJobLimits() const;
+  // Need a static version that can be called during SanitizeOptions().
+  static BGJobLimits GetBGJobLimits(int max_background_flushes,
+                                    int max_background_compactions,
+                                    int max_background_jobs,
+                                    bool parallelize_compactions);
+
+  // Move logs pending closing from job_context to the DB queue and
+  // schedule a purge.
+  void ScheduleBgLogWriterClose(JobContext* job_context);
+
+  uint64_t MinLogNumberToKeep();
+
+  // Returns the lower bound file number for SSTs that won't be deleted, even
+  // if they're obsolete. This lower bound is used internally to prevent
+  // newly created flush/compaction output files from being deleted before
+  // they're installed. This technique avoids the need for tracking the exact
+  // numbers of files pending creation, although it prevents more files than
+  // necessary from being deleted.
+  uint64_t MinObsoleteSstNumberToKeep();
+
+  // Returns the list of live files in 'live' and the list
+  // of all files in the filesystem in 'candidate_files'.
+  // If force == false and the last call was less than
+  // db_options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the job_context.
+  void FindObsoleteFiles(JobContext* job_context, bool force,
+                         bool no_full_scan = false);
+
+  // Diffs the files listed in filenames with the live files, and possibly
+  // removes those that do not belong to live files. Also removes all the
+  // files in sst_delete_files and log_delete_files.
+  // It is not necessary to hold the mutex when invoking this method.
+  // If FindObsoleteFiles() was run, we need to also run
+  // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true.
+  void PurgeObsoleteFiles(JobContext& background_context,
+                          bool schedule_only = false);
+
+  // Schedule a background job to actually delete obsolete files.
+  void SchedulePurge();
+
+  const SnapshotList& snapshots() const { return snapshots_; }
+
+  // Load the list of snapshots that are no newer than `max_seq` into
+  // `snap_vector`, in ascending order.
+  // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+  // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+  void LoadSnapshots(std::vector<SnapshotImpl*>* snap_vector,
+                     SequenceNumber* oldest_write_conflict_snapshot,
+                     const SequenceNumber& max_seq) const {
+    InstrumentedMutexLock l(mutex());
+    snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+  }
+
+  const ImmutableDBOptions& immutable_db_options() const {
+    return immutable_db_options_;
+  }
+
+  // Cancel all background jobs, including flush, compaction, background
+  // purging, stats dumping threads, etc. If `wait` = true, wait for the
+  // running jobs to abort or finish before returning. Otherwise, only
+  // sends the signals.
+  void CancelAllBackgroundWork(bool wait);
+
+  // Find the SuperVersion and reference it. Based on the options, it might
+  // return the thread-local cached one.
+  // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+  SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+  // Similar to the previous function but looks up based on a column family
+  // id. nullptr will be returned if this column family no longer exists.
+  // REQUIRED: this function should only be called on the write thread or if
+  // the mutex is held.
+  SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+  // Un-reference the super version and clean it up if it is the last
+  // reference.
+  void CleanupSuperVersion(SuperVersion* sv);
+
+  // Un-reference the super version and return it to the thread-local cache
+  // if needed. If it is the last reference of the super version, clean it up
+  // after un-referencing it.
+  void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+  // Similar to the previous function but looks up based on a column family
+  // id. nullptr will be returned if this column family no longer exists.
+  // REQUIRED: this function should only be called on the write thread.
+  void ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+                                    SuperVersion* sv);
+
+  // REQUIRED: this function should only be called on the write thread or if
+  // the mutex is held. The return value is only valid until the next call to
+  // this function or until the mutex is released.
+  ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+  // Same as above, but should be called without the mutex held and not on
+  // the write thread.
+  std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+      uint32_t column_family_id);
+
+  // Returns the number of currently running flushes.
+  // REQUIREMENT: mutex_ must be held when calling this function.
+  int num_running_flushes() {
+    mutex_.AssertHeld();
+    return num_running_flushes_;
+  }
+
+  // Returns the number of currently running compactions.
+  // REQUIREMENT: mutex_ must be held when calling this function.
+  int num_running_compactions() {
+    mutex_.AssertHeld();
+    return num_running_compactions_;
+  }
+
+  const WriteController& write_controller() { return write_controller_; }
+
+  InternalIterator* NewInternalIterator(
+      const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version,
+      Arena* arena, RangeDelAggregator* range_del_agg,
+      SequenceNumber sequence);
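+
+  // Editorial sketch (not part of the original source): the acquire/release
+  // contract for the SuperVersion helpers above; `cfd` is a hypothetical
+  // ColumnFamilyData*:
+  //
+  //   SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  //   // ... read through sv->mem / sv->imm / sv->current ...
+  //   ReturnAndCleanupSuperVersion(cfd, sv);  // must always be paired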
+
+  // A hollow transaction shell used for recovery.
+  // These will then be passed to TransactionDB so that
+  // locks can be reacquired before writing can resume.
+  struct RecoveredTransaction {
+    std::string name_;
+    bool unprepared_;
+
+    struct BatchInfo {
+      uint64_t log_number_;
+      // TODO(lth): For unprepared transactions, the memory usage here can be
+      // big. This is only useful for rollbacks, and we can in theory just
+      // keep the keyset for that.
+      WriteBatch* batch_;
+      // Number of sub-batches. A new sub-batch is created if the txn
+      // attempts to insert a duplicate (key, seq) into the memtable. This is
+      // currently used in WritePreparedTxn/WriteUnpreparedTxn.
+      size_t batch_cnt_;
+    };
+
+    // This maps the seq of the first key in the batch to BatchInfo, which
+    // contains WriteBatch and other information relevant to the batch.
+    //
+    // For WriteUnprepared, batches_ can have size greater than 1, but for
+    // other write policies, it must be of size 1.
+    std::map<SequenceNumber, BatchInfo> batches_;
+
+    explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+                                  WriteBatch* batch, SequenceNumber seq,
+                                  size_t batch_cnt, bool unprepared)
+        : name_(name), unprepared_(unprepared) {
+      batches_[seq] = {log, batch, batch_cnt};
+    }
+
+    ~RecoveredTransaction() {
+      for (auto& it : batches_) {
+        delete it.second.batch_;
+      }
+    }
+
+    void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+                  size_t batch_cnt, bool unprepared) {
+      assert(batches_.count(seq) == 0);
+      batches_[seq] = {log_number, batch, batch_cnt};
+      // Prior state must be unprepared, since the prepare batch must be the
+      // last batch.
+      assert(unprepared_);
+      unprepared_ = unprepared;
+    }
+  };
+
+  bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+  std::unordered_map<std::string, RecoveredTransaction*>
+  recovered_transactions() {
+    return recovered_transactions_;
+  }
+
+  RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+    auto it = recovered_transactions_.find(name);
+    if (it == recovered_transactions_.end()) {
+      return nullptr;
+    } else {
+      return it->second;
+    }
+  }
+
+  void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+                                  WriteBatch* batch, SequenceNumber seq,
+                                  size_t batch_cnt, bool unprepared_batch) {
+    // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+    // times for every unprepared batch encountered during recovery.
+    //
+    // If the transaction is prepared, then the last call to
+    // InsertRecoveredTransaction will have unprepared_batch = false.
+    auto rtxn = recovered_transactions_.find(name);
+    if (rtxn == recovered_transactions_.end()) {
+      recovered_transactions_[name] = new RecoveredTransaction(
+          log, name, batch, seq, batch_cnt, unprepared_batch);
+    } else {
+      rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+    }
+    logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+  }
+
+  void DeleteRecoveredTransaction(const std::string& name) {
+    auto it = recovered_transactions_.find(name);
+    assert(it != recovered_transactions_.end());
+    auto* trx = it->second;
+    recovered_transactions_.erase(it);
+    for (const auto& info : trx->batches_) {
+      logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+          info.second.log_number_);
+    }
+    delete trx;
+  }
+
+  void DeleteAllRecoveredTransactions() {
+    for (auto it = recovered_transactions_.begin();
+         it != recovered_transactions_.end(); ++it) {
+      delete it->second;
+    }
+    recovered_transactions_.clear();
+  }
+
+  void AddToLogsToFreeQueue(log::Writer* log_writer) {
+    logs_to_free_queue_.push_back(log_writer);
+  }
+
+  void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+    superversions_to_free_queue_.push_back(sv);
+  }
+
+  void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+  // Fill JobContext with snapshot information needed by flush and compaction.
+  void GetSnapshotContext(JobContext* job_context,
+                          std::vector<SequenceNumber>* snapshot_seqs,
+                          SequenceNumber* earliest_write_conflict_snapshot,
+                          SnapshotChecker** snapshot_checker);
+
+  // Not thread-safe.
+  void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+  InstrumentedMutex* mutex() const { return &mutex_; }
+
+  // Initialize a brand new DB. The DB directory is expected to be empty
+  // before calling it.
+  Status NewDB();
+
+  // This is to be used only by internal rocksdb classes.
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>&
+                         column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+                     const bool seq_per_batch, const bool batch_per_txn);
+
+  static Status CreateAndNewDirectory(Env* env, const std::string& dirname,
+                                      std::unique_ptr<Directory>* directory);
+
+  // Find the stats map from stats_history_ with the smallest timestamp in
+  // the range of [start_time, end_time).
+  bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+                       uint64_t* new_time,
+                       std::map<std::string, uint64_t>* stats_map);
+
+  // Print information about all range tombstones of all iterators to the
+  // std::string. This is only used by ldb. The output might be capped.
+  // Tombstones printed out are not guaranteed to be in any order.
+  Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+                                     int max_entries_to_print,
+                                     std::string* out_str);
+
+#ifndef NDEBUG
+  // Compact any files in the named level that overlap [*begin, *end]
+  Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+                           ColumnFamilyHandle* column_family = nullptr,
+                           bool disallow_trivial_move = false);
+
+  void TEST_SwitchWAL();
+
+  bool TEST_UnableToReleaseOldestLog() {
+    return unable_to_release_oldest_log_;
+  }
+
+  bool TEST_IsLogGettingFlushed() {
+    return alive_log_files_.begin()->getting_flushed;
+  }
+
+  Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+  // Force current memtable contents to be flushed.
+  Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+                            ColumnFamilyHandle* cfh = nullptr);
+
+  Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+                            const FlushOptions& flush_opts);
+
+  // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+  // is because in certain cases, we can flush column families, wait for the
+  // flush to complete, but delete the column family handle before the wait
+  // finishes. For example in CompactRange.
+  Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+                                   const FlushOptions& flush_opts);
+
+  // Wait for a memtable flush to complete.
+  Status TEST_WaitForFlushMemTable(
+      ColumnFamilyHandle* column_family = nullptr);
+
+  // Wait for any compaction.
+  // We add a bool parameter to wait for unscheduledCompactions_ == 0, but
+  // this is only for the special test of CancelledCompactions.
+  Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes(
+      ColumnFamilyHandle* column_family = nullptr);
+
+  // Return the current manifest file no.
+  uint64_t TEST_Current_Manifest_FileNo();
+
+  // Returns the number that'll be assigned to the next file that's created.
+  uint64_t TEST_Current_Next_FileNo();
+
+  // Get the total size of level-0 files. Only for testing.
+  uint64_t TEST_GetLevel0TotalSize();
+
+  void TEST_GetFilesMetaData(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::vector<FileMetaData>>* metadata);
+
+  void TEST_LockMutex();
+
+  void TEST_UnlockMutex();
+
+  // REQUIRES: mutex locked
+  void* TEST_BeginWrite();
+
+  // REQUIRES: mutex locked
+  // pass the pointer that you got from TEST_BeginWrite()
+  void TEST_EndWrite(void* w);
+
+  uint64_t TEST_MaxTotalInMemoryState() const {
+    return max_total_in_memory_state_;
+  }
+
+  size_t TEST_LogsToFreeSize();
+
+  uint64_t TEST_LogfileNumber();
+
+  uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+  // Returns column family name to ImmutableCFOptions map.
+  Status TEST_GetAllImmutableCFOptions(
+      std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+  // Return the latest MutableCFOptions of a column family.
+  Status TEST_GetLatestMutableCFOptions(
+      ColumnFamilyHandle* column_family,
+      MutableCFOptions* mutable_cf_options);
+
+  Cache* TEST_table_cache() { return table_cache_.get(); }
+
+  WriteController& TEST_write_controler() { return write_controller_; }
+
+  uint64_t TEST_FindMinLogContainingOutstandingPrep();
+  uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+  size_t TEST_PreparedSectionCompletedSize();
+  size_t TEST_LogsWithPrepSize();
+
+  int TEST_BGCompactionsAllowed() const;
+  int TEST_BGFlushesAllowed() const;
+  size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+  void TEST_WaitForDumpStatsRun(std::function<void()> callback) const;
+  void TEST_WaitForPersistStatsRun(std::function<void()> callback) const;
+  bool TEST_IsPersistentStatsEnabled() const;
+  size_t TEST_EstimateInMemoryStatsHistorySize() const;
+#endif  // NDEBUG
+
+ protected:
+  const std::string dbname_;
+  std::string db_id_;
+  std::unique_ptr<VersionSet> versions_;
+  // Flag to check whether we allocated and own the info log file
+  bool own_info_log_;
+  const DBOptions initial_db_options_;
+  Env* const env_;
+  std::shared_ptr<FileSystem> fs_;
+  const ImmutableDBOptions immutable_db_options_;
+  MutableDBOptions mutable_db_options_;
+  Statistics* stats_;
+  std::unordered_map<std::string, RecoveredTransaction*>
+      recovered_transactions_;
+  std::unique_ptr<Tracer> tracer_;
+  InstrumentedMutex trace_mutex_;
+  BlockCacheTracer block_cache_tracer_;
+
+  // State below is protected by mutex_.
+  // With two_write_queues enabled, some of the variables that are accessed
+  // during WriteToWAL need different synchronization: log_empty_,
+  // alive_log_files_, logs_, logfile_number_. Refer to the definition of
+  // each variable below for more description.
+  mutable InstrumentedMutex mutex_;
+
+  ColumnFamilyHandleImpl* default_cf_handle_;
+  InternalStats* default_cf_internal_stats_;
+
+  // only used for dynamically adjusting max_total_wal_size. it is a sum of
+  // [write_buffer_size * max_write_buffer_number] over all column families
+  uint64_t max_total_in_memory_state_;
+  // If true, we have only one (default) column family. We use this to
+  // optimize some code-paths.
+  bool single_column_family_mode_;
+
+  // The options to access storage files
+  const FileOptions file_options_;
+
+  // Additional options for compaction and flush
+  FileOptions file_options_for_compaction_;
+
+  std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+  // Increase the sequence number after writing each batch, whether the
+  // memtable is disabled for that or not. Otherwise the sequence number is
+  // increased after writing each key into the memtable. This implies that
+  // when disable_memtable is set, the seq is not increased at all.
+  //
+  // Default: false
+  const bool seq_per_batch_;
+  // This determines during recovery whether we expect one writebatch per
+  // recovered transaction, or potentially multiple writebatches per
+  // transaction. For WriteUnprepared, this is set to false, since multiple
+  // batches can exist per transaction.
+  //
+  // Default: true
+  const bool batch_per_txn_;
+
+  // Persist options to the options file.
+  // If need_mutex_lock = true, the method will lock the DB mutex; otherwise
+  // the caller must already hold it.
+  // If need_enter_write_thread = true, the method will enter the write
+  // thread; otherwise the caller must already have entered it.
+  // Except in DB::Open(), one of these preconditions must be arranged by the
+  // caller.
+  Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+  // The following two functions can only be called when:
+  // 1. WriteThread::Writer::EnterUnbatched() is used.
+  // 2. db_mutex is NOT held
+  Status RenameTempFileToOptionsFile(const std::string& file_name);
+  Status DeleteObsoleteOptionsFiles();
+
+  void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+                          const MutableCFOptions& mutable_cf_options,
+                          int job_id);
+
+  void NotifyOnFlushCompleted(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+  void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+                               const Status& st,
+                               const CompactionJobStats& job_stats,
+                               int job_id);
+
+  void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+                                   const Status& st,
+                                   const CompactionJobStats& job_stats,
+                                   int job_id);
+  void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+                              const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+  void NotifyOnExternalFileIngested(
+      ColumnFamilyData* cfd,
+      const ExternalSstFileIngestionJob& ingestion_job);
+#endif  // !ROCKSDB_LITE
+
+  void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusDbInfo() const;
+
+  // If disable_memtable is set, the application logic must guarantee that
+  // the batch will still be skipped from the memtable during recovery. An
+  // exception to this is seq_per_batch_ mode, in which, since each batch
+  // already takes one seq, it is ok for the batch to write to the memtable
+  // during recovery as long as it only takes one sequence number: i.e., no
+  // duplicate keys.
+  // In WriteCommitted this is guaranteed, since disable_memtable is used for
+  // the prepare batch which will be written to the memtable later during the
+  // commit; and in WritePrepared it is guaranteed since it will be used only
+  // for WAL markers which will never be written to the memtable. If the
+  // commit marker is accompanied by a CommitTimeWriteBatch that is not
+  // written to the memtable, as long as it has no duplicate keys, it does
+  // not violate the one-seq-per-batch policy.
+  // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates
+  // the number of sub-batches. A sub-batch is a subset of the write batch
+  // that does not have duplicate keys; e.g. Put(a), Put(b), Put(a), Put(c)
+  // splits into two sub-batches: {a, b} and {a, c}.
+  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+                   WriteCallback* callback = nullptr,
+                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                   bool disable_memtable = false,
+                   uint64_t* seq_used = nullptr, size_t batch_cnt = 0,
+                   PreReleaseCallback* pre_release_callback = nullptr);
+
+  Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+                            WriteCallback* callback = nullptr,
+                            uint64_t* log_used = nullptr,
+                            uint64_t log_ref = 0,
+                            bool disable_memtable = false,
+                            uint64_t* seq_used = nullptr);
+
+  // Write only to memtables without joining any write queue
+  Status UnorderedWriteMemtable(const WriteOptions& write_options,
+                                WriteBatch* my_batch, WriteCallback* callback,
+                                uint64_t log_ref, SequenceNumber seq,
+                                const size_t sub_batch_cnt);
+
+  // Whether the batch requires to be assigned with an order
+  enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+  // Whether it requires publishing last sequence or not
+  enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+  // Join the write_thread to write the batch only to the WAL. It is the
+  // responsibility of the caller to also write the write batch to the
+  // memtable if required.
+  //
+  // sub_batch_cnt is expected to be non-zero when assign_order =
+  // kDoAssignOrder, indicating the number of sub-batches in my_batch. A
+  // sub-batch is a subset of the write batch that does not have duplicate
+  // keys. When seq_per_batch is not set, each key is a separate sub_batch.
+  // Otherwise each duplicate key marks the start of a new sub-batch.
+  Status WriteImplWALOnly(
+      WriteThread* write_thread, const WriteOptions& options,
+      WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+      const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+      PreReleaseCallback* pre_release_callback,
+      const AssignOrder assign_order, const PublishLastSeq publish_last_seq,
+      const bool disable_memtable);
+
+  // Write cached_recoverable_state_ to the memtable if it is not empty.
+  // The writer must be the leader in write_thread_ and holding mutex_.
+  Status WriteRecoverableState();
+
+  // Actual implementation of Close()
+  Status CloseImpl();
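+
+  // Editorial sketch (not part of the original source): how a write-prepared
+  // transaction layer might drive WriteImpl() for a prepare batch that must
+  // reach the WAL but not the memtable; all variable names are hypothetical:
+  //
+  //   uint64_t log_used = 0;
+  //   uint64_t seq_used = 0;
+  //   Status s = db_impl->WriteImpl(write_options, &prepare_batch,
+  //                                 /*callback=*/nullptr, &log_used,
+  //                                 /*log_ref=*/0,
+  //                                 /*disable_memtable=*/true, &seq_used,
+  //                                 /*batch_cnt=*/sub_batch_cnt);
+  //   // log_used must stay alive until the commit marker is persisted.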
+  // Write cached_recoverable_state_ to the memtable if it is not empty.
+  // The writer must be the leader in write_thread_ and must hold mutex_.
+  Status WriteRecoverableState();
+
+  // Actual implementation of Close()
+  Status CloseImpl();
+
+  // Recover the descriptor from persistent storage. May do a significant
+  // amount of work to recover recently logged updates. Any changes to
+  // be made to the descriptor are added to *edit.
+  // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+  // skipped.
+  virtual Status Recover(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      bool read_only = false, bool error_if_log_file_exist = false,
+      bool error_if_data_exists_in_logs = false,
+      uint64_t* recovered_seq = nullptr);
+
+  virtual bool OwnTablesAndLogs() const { return true; }
+
+ private:
+  friend class DB;
+  friend class ErrorHandler;
+  friend class InternalStats;
+  friend class PessimisticTransaction;
+  friend class TransactionBaseImpl;
+  friend class WriteCommittedTxn;
+  friend class WritePreparedTxn;
+  friend class WritePreparedTxnDB;
+  friend class WriteBatchWithIndex;
+  friend class WriteUnpreparedTxnDB;
+  friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+  friend class ForwardIterator;
+#endif
+  friend struct SuperVersion;
+  friend class CompactedDBImpl;
+  friend class DBTest_ConcurrentFlushWAL_Test;
+  friend class DBTest_MixedSlowdownOptionsStop_Test;
+  friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+  friend class DBCompactionTest_CompactionDuringShutdown_Test;
+  friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+  friend class DBTest2_ReadCallbackTest_Test;
+  friend class WriteCallbackTest_WriteWithCallbackTest_Test;
+  friend class XFTransactionWriteHandler;
+  friend class DBBlobIndexTest;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+  struct CompactionState;
+  struct PrepickedCompaction;
+  struct PurgeFileInfo;
+
+  struct WriteContext {
+    SuperVersionContext superversion_context;
+    autovector<MemTable*> memtables_to_free_;
+
+    explicit WriteContext(bool create_superversion = false)
+        : superversion_context(create_superversion) {}
+
+    ~WriteContext() {
+      superversion_context.Clean();
+      for (auto& m : memtables_to_free_) {
+        delete m;
+      }
+    }
+  };
+
+  struct LogFileNumberSize {
+    explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+    void AddSize(uint64_t new_size) { size += new_size; }
+    uint64_t number;
+    uint64_t size = 0;
+    bool getting_flushed = false;
+  };
+
+  struct LogWriterNumber {
+    // pass ownership of _writer
+    LogWriterNumber(uint64_t _number, log::Writer* _writer)
+        : number(_number), writer(_writer) {}
+
+    log::Writer* ReleaseWriter() {
+      auto* w = writer;
+      writer = nullptr;
+      return w;
+    }
+    Status ClearWriter() {
+      Status s = writer->WriteBuffer();
+      delete writer;
+      writer = nullptr;
+      return s;
+    }
+
+    uint64_t number;
+    // Visual Studio doesn't allow a deque's element type to be noncopyable,
+    // which a std::unique_ptr member would make this struct, so a raw owning
+    // pointer is used instead.
+    log::Writer* writer;  // own
+    // true for some prefix of logs_
+    bool getting_synced = false;
+  };
+
+  // PurgeFileInfo is a structure to hold information of files to be deleted
+  // in purge_files_
+  struct PurgeFileInfo {
+    std::string fname;
+    std::string dir_to_sync;
+    FileType type;
+    uint64_t number;
+    int job_id;
+    PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+                  int jid)
+        : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+  };
+
+  // Argument required by background flush thread.
+  struct BGFlushArg {
+    BGFlushArg()
+        : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+    BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+               SuperVersionContext* superversion_context)
+        : cfd_(cfd),
+          max_memtable_id_(max_memtable_id),
+          superversion_context_(superversion_context) {}
+
+    // Column family to flush.
+    ColumnFamilyData* cfd_;
+    // Maximum ID of memtable to flush. In this column family, memtables with
+    // IDs smaller than this value must be flushed before this flush completes.
+    uint64_t max_memtable_id_;
+    // Pointer to a SuperVersionContext object. After flush completes, RocksDB
+    // installs a new superversion for the column family. This operation
+    // requires a SuperVersionContext object (currently embedded in
+    // JobContext).
+    SuperVersionContext* superversion_context_;
+  };
+
+  // Argument passed to flush thread.
+  struct FlushThreadArg {
+    DBImpl* db_;
+
+    Env::Priority thread_pri_;
+  };
+
+  // Information for a manual compaction
+  struct ManualCompactionState {
+    ColumnFamilyData* cfd;
+    int input_level;
+    int output_level;
+    uint32_t output_path_id;
+    Status status;
+    bool done;
+    bool in_progress;            // compaction request being processed?
+    bool incomplete;             // only part of requested range compacted
+    bool exclusive;              // current behavior of only one manual
+    bool disallow_trivial_move;  // Force actual compaction to run
+    const InternalKey* begin;    // nullptr means beginning of key range
+    const InternalKey* end;      // nullptr means end of key range
+    InternalKey* manual_end;     // how far we are compacting
+    InternalKey tmp_storage;     // Used to keep track of compaction progress
+    InternalKey tmp_storage1;    // Used to keep track of compaction progress
+  };
+  struct PrepickedCompaction {
+    // background compaction takes ownership of `compaction`.
+    Compaction* compaction;
+    // caller retains ownership of `manual_compaction_state` as it is reused
+    // across background compactions.
+    ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
+    // task limiter token is requested during compaction picking.
+    std::unique_ptr<TaskLimiterToken> task_token;
+  };
+
+  struct CompactionArg {
+    // caller retains ownership of `db`.
+    DBImpl* db;
+    // background compaction takes ownership of `prepicked_compaction`.
+    PrepickedCompaction* prepicked_compaction;
+  };
+
+  // Initialize the built-in column family for persistent stats. Depending on
+  // whether on-disk persistent stats have been enabled before, it may either
+  // create a new column family and column family handle or just a column
+  // family handle.
+  // Required: DB mutex held
+  Status InitPersistStatsColumnFamily();
+
+  // The Persistent Stats column family has two format version keys, which
+  // are used for compatibility checks. Write the format versions if the
+  // column family is created for the first time; read them and check
+  // compatibility when recovering from disk. This function requires the DB
+  // mutex to be held at entrance, but may release and re-acquire it in the
+  // process.
+  // Required: DB mutex held
+  Status PersistentStatsProcessFormatVersion();
+
+  Status ResumeImpl();
+
+  void MaybeIgnoreError(Status* s) const;
+
+  const Status CreateArchivalDirectory();
+
+  Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+                                const std::string& cf_name,
+                                ColumnFamilyHandle** handle);
+
+  Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+  // Delete obsolete files and log status and information of file deletion
+  void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+                              const std::string& path_to_sync, FileType type,
+                              uint64_t number);
+
+  // Background process needs to call
+  //   auto x = CaptureCurrentFileNumberInPendingOutputs()
+  //   auto file_num = versions_->NewFileNumber();
+  //   <do something about file_num>
+  //   ReleaseFileNumberFromPendingOutputs(x)
+  // This will protect any file with number `file_num` or greater from being
+  // deleted while <do something> is running.
+  // -----------
+  // This function will capture the current file number and append it to
+  // pending_outputs_. This will prevent any background process from deleting
+  // any file created after this point.
+  std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+  // This function should be called with the result of
+  // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+  // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+  // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+  // and blocked by any other pending_outputs_ calls)
+  void ReleaseFileNumberFromPendingOutputs(
+      std::unique_ptr<std::list<uint64_t>::iterator>& v);
+
+  Status SyncClosedLogs(JobContext* job_context);
+
+  // Flush the in-memory write buffer to storage. Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful. Then
+  // installs a new super version for the column family.
+  Status FlushMemTableToOutputFile(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      bool* made_progress, JobContext* job_context,
+      SuperVersionContext* superversion_context,
+      std::vector<SequenceNumber>& snapshot_seqs,
+      SequenceNumber earliest_write_conflict_snapshot,
+      SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+      Env::Priority thread_pri);
+
+  // Flush the memtables of (multiple) column families to multiple files on
+  // persistent storage.
+  Status FlushMemTablesToOutputFiles(
+      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+      JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+  Status AtomicFlushMemTablesToOutputFiles(
+      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+      JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+  // REQUIRES: log_numbers are sorted in ascending order
+  // corrupted_log_found is set to true if we recover from a corrupted log
+  // file.
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* next_sequence, bool read_only,
+                         bool* corrupted_log_found);
+
+  // The following two methods are used to flush a memtable to storage. The
+  // first one is used at database recovery time (when the database is opened)
+  // and is heavyweight because it holds the mutex for the entire period. The
+  // second method, WriteLevel0Table, supports concurrent flush of memtables
+  // to storage.
+  Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                     MemTable* mem, VersionEdit* edit);
+
+  // Restore alive_log_files_ and total_log_size_ after recovery.
+  // It needs to run only when there's no flush during recovery
+  // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+  // in case total_log_size > max_total_wal_size.
+  Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+  // num_bytes: for slowdown case, delay time is calculated based on
+  // `num_bytes` going through.
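+  // (Illustrative: if the WriteController currently grants a delayed write
+  //  rate of R bytes/sec, a batch of num_bytes is held back for roughly
+  //  num_bytes / R seconds, smoothing ingestion to the configured rate.)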
+  Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+  Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+                                      WriteBatch* my_batch);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status ScheduleFlushes(WriteContext* context);
+
+  void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+  Status TrimMemtableHistory(WriteContext* context);
+
+  Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+  void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+  // Force current memtable contents to be flushed.
+  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+                       FlushReason flush_reason, bool writes_stopped = false);
+
+  Status AtomicFlushMemTables(
+      const autovector<ColumnFamilyData*>& column_family_datas,
+      const FlushOptions& options, FlushReason flush_reason,
+      bool writes_stopped = false);
+
+  // Wait until flushing this column family won't stall writes
+  Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+                                           bool* flush_needed);
+
+  // Wait for memtable flushed.
+  // If flush_memtable_id is non-null, wait until the memtable with that ID
+  // gets flushed. Otherwise, wait until the column family doesn't have any
+  // memtable pending flush.
+  // resuming_from_bg_err indicates whether the caller is attempting to resume
+  // from background error.
+  Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+                              const uint64_t* flush_memtable_id = nullptr,
+                              bool resuming_from_bg_err = false) {
+    return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+                                 resuming_from_bg_err);
+  }
+  // Wait for memtables to be flushed for multiple column families.
+  Status WaitForFlushMemTables(
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const uint64_t*>& flush_memtable_ids,
+      bool resuming_from_bg_err);
+
+  inline void WaitForPendingWrites() {
+    mutex_.AssertHeld();
+    TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+    // In case pipelined write is enabled, wait for all pending memtable
+    // writers.
+    if (immutable_db_options_.enable_pipelined_write) {
+      // Memtable writers may call DB::Get in case max_successive_merges > 0,
+      // which may lock mutex. Unlocking mutex here to avoid deadlock.
+      mutex_.Unlock();
+      write_thread_.WaitForMemTableWriters();
+      mutex_.Lock();
+    }
+
+    if (!immutable_db_options_.unordered_write) {
+      // Then the writes are finished before the next write group starts
+      return;
+    }
+
+    // Wait for the ones who already wrote to the WAL to finish their
+    // memtable write.
+    if (pending_memtable_writes_.load() != 0) {
+      std::unique_lock<std::mutex> guard(switch_mutex_);
+      switch_cv_.wait(guard,
+                      [&] { return pending_memtable_writes_.load() == 0; });
+    }
+  }
+
+  // REQUIRES: mutex locked and in write thread.
+  void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status SwitchWAL(WriteContext* write_context);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status HandleWriteBufferFull(WriteContext* write_context);
+
+  // REQUIRES: mutex locked
+  Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
+                         WriteContext* write_context);
+
+  WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group,
+                         WriteBatch* tmp_batch, size_t* write_with_wal,
+                         WriteBatch** to_be_cached_state);
+
+  Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+                    uint64_t* log_used, uint64_t* log_size);
+
+  Status WriteToWAL(const WriteThread::WriteGroup& write_group,
+                    log::Writer* log_writer, uint64_t* log_used,
+                    bool need_log_sync, bool need_log_dir_sync,
+                    SequenceNumber sequence);
+
+  Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+                              uint64_t* log_used, SequenceNumber* last_sequence,
+                              size_t seq_inc);
+
+  // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+  void WriteStatusCheck(const Status& status);
+
+  // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+  void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+
+  Status CompactFilesImpl(const CompactionOptions& compact_options,
+                          ColumnFamilyData* cfd, Version* version,
+                          const std::vector<std::string>& input_file_names,
+                          std::vector<std::string>* const output_file_names,
+                          const int output_level, int output_path_id,
+                          JobContext* job_context, LogBuffer* log_buffer,
+                          CompactionJobInfo* compaction_job_info);
+
+  // Wait for current IngestExternalFile() calls to finish.
+  // REQUIRES: mutex_ held
+  void WaitForIngestFile();
+
+#else
+  // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+  // will be a no-op
+  void WaitForIngestFile() {}
+#endif  // ROCKSDB_LITE
+
+  ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+  void MaybeScheduleFlushOrCompaction();
+
+  // A flush request specifies the column families to flush as well as the
+  // largest memtable id to persist for each column family. Once all the
+  // memtables whose IDs are smaller than or equal to this per-column-family
+  // specified value have been flushed, this flush request is considered to
+  // have completed its work of flushing this column family. After completing
+  // the work for all column families in this request, this flush is
+  // considered complete.
+  typedef std::vector<std::pair<ColumnFamilyData*, uint64_t>> FlushRequest;
+
+  void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+                            FlushRequest* req);
+
+  void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+
+  void SchedulePendingCompaction(ColumnFamilyData* cfd);
+  void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+                            FileType type, uint64_t number, int job_id);
+  static void BGWorkCompaction(void* arg);
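+  // (Illustrative scheduling sketch, assuming a heap-allocated CompactionArg*
+  //  ca as described above:
+  //    env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+  //                   &DBImpl::UnscheduleCompactionCallback);
+  //  the thread pool later invokes BGWorkCompaction(ca) on a background
+  //  thread, or UnscheduleCompactionCallback(ca) if the job is dropped.)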
+  // Runs a pre-chosen universal compaction involving the bottom level in a
+  // separate, bottom-pri thread pool.
+  static void BGWorkBottomCompaction(void* arg);
+  static void BGWorkFlush(void* arg);
+  static void BGWorkPurge(void* arg);
+  static void UnscheduleCompactionCallback(void* arg);
+  static void UnscheduleFlushCallback(void* arg);
+  void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+                                Env::Priority thread_pri);
+  void BackgroundCallFlush(Env::Priority thread_pri);
+  void BackgroundCallPurge();
+  Status BackgroundCompaction(bool* made_progress, JobContext* job_context,
+                              LogBuffer* log_buffer,
+                              PrepickedCompaction* prepicked_compaction,
+                              Env::Priority thread_pri);
+  Status BackgroundFlush(bool* made_progress, JobContext* job_context,
+                         LogBuffer* log_buffer, FlushReason* reason,
+                         Env::Priority thread_pri);
+
+  bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+                               const std::vector<CompactionInputFiles>& inputs,
+                               bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+  // Request a compaction task token from the compaction thread limiter.
+  // It always succeeds if force = true or the limiter is disabled.
+  bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+                              std::unique_ptr<TaskLimiterToken>* token,
+                              LogBuffer* log_buffer);
+
+  // Schedule background tasks
+  void StartTimedTasks();
+
+  void PrintStatistics();
+
+  size_t EstimateInMemoryStatsHistorySize() const;
+
+  // persist stats to column family "_persistent_stats"
+  void PersistStats();
+
+  // dump rocksdb.stats to LOG
+  void DumpStats();
+
+  // Return the minimum empty level that could hold the total data in the
+  // input level. Return the input level if no such level could be found.
+  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level);
+
+  // Move the files in the input level to the target level.
+  // If target_level < 0, automatically calculate the minimum level that could
+  // hold the data set.
+  Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+  // helper functions for adding and removing from flush & compaction queues
+  void AddToCompactionQueue(ColumnFamilyData* cfd);
+  ColumnFamilyData* PopFirstFromCompactionQueue();
+  FlushRequest PopFirstFromFlushQueue();
+
+  // Pick the first unthrottled compaction with task token from queue.
+  ColumnFamilyData* PickCompactionFromQueue(
+      std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+  // helper function to call after some of the logs_ were synced
+  void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status);
+
+  SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+                                bool lock = true);
+
+  uint64_t GetMaxTotalWalSize() const;
+
+  Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+  Status CloseHelper();
+
+  void WaitForBackgroundWork();
+
+  // Background threads call this function, which is just a wrapper around
+  // the InstallSuperVersion() function. Background threads carry
+  // sv_context which can have new_superversion already
+  // allocated.
+  // All ColumnFamily state changes go through this function. Here we analyze
+  // the new state and we schedule background work if we detect that the new
+  // state needs flush or compaction.
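+  // (For instance, FlushMemTableToOutputFile in db_impl_compaction_flush.cc
+  //  calls this after a successful flush to publish the new Version and to
+  //  kick off any newly needed flush or compaction.)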
+  void InstallSuperVersionAndScheduleWork(
+      ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+      const MutableCFOptions& mutable_cf_options);
+
+  bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+                              const DBPropertyInfo& property_info,
+                              bool is_locked, uint64_t* value);
+  bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+  bool HasPendingManualCompaction();
+  bool HasExclusiveManualCompaction();
+  void AddManualCompaction(ManualCompactionState* m);
+  void RemoveManualCompaction(ManualCompactionState* m);
+  bool ShouldntRunManualCompaction(ManualCompactionState* m);
+  bool HaveManualCompaction(ColumnFamilyData* cfd);
+  bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+  void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+                              const Status& st,
+                              const CompactionJobStats& compaction_job_stats,
+                              const int job_id, const Version* current,
+                              CompactionJobInfo* compaction_job_info) const;
+  // Reserve the next 'num' file numbers for to-be-ingested external SST
+  // files, and return the current file_number in 'next_file_number'.
+  // Write a version edit to the MANIFEST.
+  Status ReserveFileNumbersBeforeIngestion(
+      ColumnFamilyData* cfd, uint64_t num,
+      std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+      uint64_t* next_file_number);
+#endif  // !ROCKSDB_LITE
+
+  bool ShouldPurge(uint64_t file_number) const;
+  void MarkAsGrabbedForPurge(uint64_t file_number);
+
+  size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+  Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+  Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+                   size_t preallocate_block_size, log::Writer** new_log);
+
+  // Validate self-consistency of DB options
+  static Status ValidateOptions(const DBOptions& db_options);
+  // Validate self-consistency of DB options and its consistency with cf
+  // options
+  static Status ValidateOptions(
+      const DBOptions& db_options,
+      const std::vector<ColumnFamilyDescriptor>& column_families);
+
+  // Utility function to do some debug validation and sort the given vector
+  // of MultiGet keys
+  void PrepareMultiGetKeys(
+      const size_t num_keys, bool sorted,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+  // A structure to hold the information required to process MultiGet of keys
+  // belonging to one column family. For a multi column family MultiGet, there
+  // will be a container of these objects.
+  struct MultiGetColumnFamilyData {
+    ColumnFamilyHandle* cf;
+    ColumnFamilyData* cfd;
+
+    // For the batched MultiGet which relies on sorted keys, start specifies
+    // the index of the first key belonging to this column family in the
+    // sorted list.
+    size_t start;
+
+    // For the batched MultiGet case, num_keys specifies the number of keys
+    // belonging to this column family in the sorted list.
+    size_t num_keys;
+
+    // SuperVersion for the column family obtained in a manner that ensures a
+    // consistent view across all column families in the DB
+    SuperVersion* super_version;
+    MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+                             SuperVersion* sv)
+        : cf(column_family),
+          cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+          start(0),
+          num_keys(0),
+          super_version(sv) {}
+
+    MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+                             size_t count, SuperVersion* sv)
+        : cf(column_family),
+          cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+          start(first),
+          num_keys(count),
+          super_version(sv) {}
+
+    MultiGetColumnFamilyData() = default;
+  };
+
+  // A common function to obtain a consistent snapshot, which can be implicit
+  // if the user doesn't specify a snapshot in read_options, across
+  // multiple column families for MultiGet. It will attempt to get an implicit
+  // snapshot without acquiring the db_mutex, but will give up after a few
+  // tries and acquire the mutex if a memtable flush happens. The template
+  // allows both the batched and non-batched MultiGet to call this with
+  // either an std::unordered_map or autovector of column families.
+  //
+  // If callback is non-null, the callback is refreshed with the snapshot
+  // sequence number
+  //
+  // A return value of true indicates that the SuperVersions were obtained
+  // from the ColumnFamilyData, whereas false indicates they are thread
+  // local
+  template <class T>
+  bool MultiCFSnapshot(
+      const ReadOptions& read_options, ReadCallback* callback,
+      std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+          iter_deref_func,
+      T* cf_list, SequenceNumber* snapshot);
+
+  // The actual implementation of the batching MultiGet. The caller is
+  // expected to have acquired the SuperVersion and pass in a snapshot
+  // sequence number in order to construct the LookupKeys. The start_key and
+  // num_keys specify the range of keys in the sorted_keys vector for a single
+  // column family.
+  void MultiGetImpl(
+      const ReadOptions& read_options, size_t start_key, size_t num_keys,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+      SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback,
+      bool* is_blob_index);
+
+  // table_cache_ provides its own synchronization
+  std::shared_ptr<Cache> table_cache_;
+
+  // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
+  FileLock* db_lock_;
+
+  // In addition to mutex_, stats_history_mutex_ protects writes to
+  // stats_history_.
+  InstrumentedMutex stats_history_mutex_;
+  // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
+  // logfile_number_. With two_write_queues it also protects alive_log_files_,
+  // and log_empty_. Refer to the definition of each variable below for more
+  // details.
+  // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+  // acquired, the order should be first mutex_ and then log_write_mutex_.
+  InstrumentedMutex log_write_mutex_;
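+  // (Illustrative lock-ordering example: a thread that needs both locks
+  //  should acquire mutex_ first and log_write_mutex_ second, i.e.
+  //    mutex_.Lock(); log_write_mutex_.Lock(); ...;
+  //    log_write_mutex_.Unlock(); mutex_.Unlock();
+  //  taking them in the opposite order risks deadlock.)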
+  std::atomic<bool> shutting_down_;
+  std::atomic<bool> manual_compaction_paused_;
+  // This condition variable is signaled on these conditions:
+  // * whenever bg_compaction_scheduled_ goes down to 0
+  // * if AnyManualCompaction, whenever a compaction finishes, even if it
+  //   hasn't made any progress
+  // * whenever a compaction made any progress
+  // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+  //   (i.e. whenever a flush is done, even if it didn't make any progress)
+  // * whenever there is an error in background purge, flush or compaction
+  // * whenever num_running_ingest_file_ goes to 0.
+  // * whenever pending_purge_obsolete_files_ goes to 0.
+  // * whenever disable_delete_obsolete_files_ goes to 0.
+  // * whenever SetOptions successfully updates options.
+  // * whenever a column family is dropped.
+  InstrumentedCondVar bg_cv_;
+  // Writes are protected by locking both mutex_ and log_write_mutex_, and
+  // reads must be under either mutex_ or log_write_mutex_. Since after
+  // ::Open, logfile_number_ is currently updated only in write_thread_, it
+  // can be read from the same write_thread_ without any locks.
+  uint64_t logfile_number_;
+  std::deque<uint64_t>
+      log_recycle_files_;  // a list of log files that we can recycle
+  bool log_dir_synced_;
+  // Without two_write_queues, reads and writes to log_empty_ are protected by
+  // mutex_. Since it is currently updated/read only in write_thread_, it can
+  // be accessed from the same write_thread_ without any locks. With
+  // two_write_queues writes, where it can be updated in different threads,
+  // reads and writes are protected by log_write_mutex_ instead. This is to
+  // avoid an expensive mutex_ lock during WAL write, which updates
+  // log_empty_.
+  bool log_empty_;
+
+  ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+  bool persistent_stats_cfd_exists_ = true;
+
+  // Without two_write_queues, reads and writes to alive_log_files_ are
+  // protected by mutex_. However, since back() is never popped, and
+  // push_back() is done only from write_thread_, the same thread can access
+  // the item referred to by back() without mutex_. With two_write_queues_,
+  // writes are protected by locking both mutex_ and log_write_mutex_, and
+  // reads must be under either mutex_ or log_write_mutex_.
+  std::deque<LogFileNumberSize> alive_log_files_;
+  // Log files that aren't fully synced, and the current log file.
+  // Synchronization:
+  //  - push_back() is done from write_thread_ with locked mutex_ and
+  //    log_write_mutex_
+  //  - pop_front() is done from any thread with locked mutex_ and
+  //    log_write_mutex_
+  //  - reads are done with either locked mutex_ or log_write_mutex_
+  //  - back() and items with getting_synced=true are not popped,
+  //  - The same thread that sets getting_synced=true will reset it.
+  //  - it follows that the object referred to by back() can be safely read
+  //    from the write_thread_ without using the mutex
+  //  - it follows that the items with getting_synced=true can be safely read
+  //    from the same thread that has set getting_synced=true
+  std::deque<LogWriterNumber> logs_;
+  // Signaled when getting_synced becomes false for some of the logs_.
+  InstrumentedCondVar log_sync_cv_;
+  // This is the app-level state that is written to the WAL but will be used
+  // only during recovery. Using this feature enables not writing the state to
+  // memtable on normal writes and hence improving the throughput. Each new
+  // write of the state will replace the previous state entirely even if the
+  // keys in the two consecutive states do not overlap.
+  // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+  // Otherwise only the head of write_thread_ can access it.
+  WriteBatch cached_recoverable_state_;
+  std::atomic<bool> cached_recoverable_state_empty_ = {true};
+  std::atomic<uint64_t> total_log_size_;
+
+  // If this is non-empty, we need to delete these log files in background
+  // threads. Protected by db mutex.
+  autovector<log::Writer*> logs_to_free_;
+
+  bool is_snapshot_supported_;
+
+  std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+  std::map<std::string, uint64_t> stats_slice_;
+
+  bool stats_slice_initialized_ = false;
+
+  Directories directories_;
+
+  WriteBufferManager* write_buffer_manager_;
+
+  WriteThread write_thread_;
+  WriteBatch tmp_batch_;
+  // The write thread for the writers that have no memtable write. This will
+  // be used in 2PC to batch the prepares separately from the serial commit.
+  WriteThread nonmem_write_thread_;
+
+  WriteController write_controller_;
+
+  // Size of the last batch group. In slowdown mode, next write needs to
+  // sleep if it uses up the quota.
+  // Note: This is to protect memtable and compaction. If the batch only
+  // writes to the WAL its size need not be included in this.
+  uint64_t last_batch_group_size_;
+
+  FlushScheduler flush_scheduler_;
+
+  TrimHistoryScheduler trim_history_scheduler_;
+
+  SnapshotList snapshots_;
+
+  // For each background job, pending_outputs_ keeps the current file number
+  // at the time that background job started.
+  // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+  // a number bigger than any of the file numbers in pending_outputs_. Since
+  // file numbers grow monotonically, this also means that pending_outputs_ is
+  // always sorted. After a background job is done executing, its file number
+  // is deleted from pending_outputs_, which allows PurgeObsoleteFiles() to
+  // clean it up.
+  // State is protected with db mutex.
+  std::list<uint64_t> pending_outputs_;
+
+  // flush_queue_ and compaction_queue_ hold column families that we need to
+  // flush and compact, respectively.
+  // A column family is inserted into flush_queue_ when it satisfies the
+  // condition cfd->imm()->IsFlushPending(), and into compaction_queue_ when
+  // it satisfies the condition cfd->NeedsCompaction().
+  // Column families in this list are all Ref()-erenced
+  // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+  // do RAII on ColumnFamilyData
+  // Column families are in this queue when they need to be flushed or
+  // compacted. Consumers of these queues are flush and compaction threads.
+  // When a column family is put on this queue, we increase
+  // unscheduled_flushes_ and unscheduled_compactions_. When these variables
+  // are bigger than zero, that means we need to schedule background threads
+  // for flush and compaction. Once the background threads are scheduled, we
+  // decrease unscheduled_flushes_ and unscheduled_compactions_. That way we
+  // keep track of the number of compaction and flush threads we need to
+  // schedule. This scheduling is done in MaybeScheduleFlushOrCompaction().
+  // invariant(column family present in flush_queue_ <==>
+  //           ColumnFamilyData::pending_flush_ == true)
+  std::deque<FlushRequest> flush_queue_;
+  // invariant(column family present in compaction_queue_ <==>
+  //           ColumnFamilyData::pending_compaction_ == true)
+  std::deque<ColumnFamilyData*> compaction_queue_;
+
+  // A map to store file numbers and filenames of the files to be purged
+  std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+  // A vector to store the file numbers that have been assigned to certain
+  // JobContext. Current implementation tracks SSTs only.
+  std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+  // A queue to store log writers to close
+  std::deque<log::Writer*> logs_to_free_queue_;
+  std::deque<SuperVersion*> superversions_to_free_queue_;
+  int unscheduled_flushes_;
+  int unscheduled_compactions_;
+
+  // count how many background compactions are running or have been scheduled
+  // in the BOTTOM pool
+  int bg_bottom_compaction_scheduled_;
+
+  // count how many background compactions are running or have been scheduled
+  int bg_compaction_scheduled_;
+
+  // stores the number of compactions that are currently running
+  int num_running_compactions_;
+
+  // number of background memtable flush jobs, submitted to the HIGH pool
+  int bg_flush_scheduled_;
+
+  // stores the number of flushes that are currently running
+  int num_running_flushes_;
+
+  // number of background obsolete file purge jobs, submitted to the HIGH pool
+  int bg_purge_scheduled_;
+
+  std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+  // shall we disable deletion of obsolete files?
+  // if 0, file deletion is enabled.
+  // if non-zero, files will not be deleted.
+  // This enables two different threads to call
+  // EnableFileDeletions() and DisableFileDeletions()
+  // without any synchronization
+  int disable_delete_obsolete_files_;
+
+  // Number of times FindObsoleteFiles has found deletable files and the
+  // corresponding call to PurgeObsoleteFiles has not yet finished.
+  int pending_purge_obsolete_files_;
+
+  // last time when DeleteObsoleteFiles with full scan was executed. Originally
+  // initialized with startup time.
+  uint64_t delete_obsolete_files_last_run_;
+
+  // last time stats were dumped to LOG
+  std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // The thread that wants to switch the memtable can wait on this cv until
+  // the pending writes to the memtable finish.
+  std::condition_variable switch_cv_;
+  // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+  std::mutex switch_mutex_;
+  // Number of threads intending to write to memtable
+  std::atomic<size_t> pending_memtable_writes_ = {};
+
+  // Each flush or compaction gets its own job id. This counter makes sure
+  // they're unique.
+  std::atomic<int> next_job_id_;
+
+  // A flag indicating whether the current rocksdb database has any
+  // data that is not yet persisted into either WAL or SST file.
+  // Used when disableWAL is true.
+  std::atomic<bool> has_unpersisted_data_;
+
+  // Set if an attempt was made to flush all column families that
+  // the oldest log depends on but uncommitted data in the oldest
+  // log prevents the log from being released.
+  // We must attempt to free the dependent memtables again
+  // at a later time after the transaction in the oldest
+  // log is fully committed.
+  bool unable_to_release_oldest_log_;
+
+  static const int KEEP_LOG_FILE_NUM = 1000;
+  // MSVC version 1800 still does not have constexpr for ::max()
+  static const uint64_t kNoTimeOut = port::kMaxUint64;
+
+  std::string db_absolute_path_;
+
+  // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+  // calls.
+  // REQUIRES: mutex held
+  int num_running_ingest_file_;
+
+#ifndef ROCKSDB_LITE
+  WalManager wal_manager_;
+#endif  // ROCKSDB_LITE
+
+  // Unified interface for logging events
+  EventLogger event_logger_;
+
+  // A value of > 0 temporarily disables scheduling of background work
+  int bg_work_paused_;
+
+  // A value of > 0 temporarily disables scheduling of background compaction
+  int bg_compaction_paused_;
+
+  // Guard against multiple concurrent refitting
+  bool refitting_level_;
+
+  // Indicate DB was opened successfully
+  bool opened_successfully_;
+
+  // The min threshold to trigger bottommost compaction for removing
+  // garbage, among all column families.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+  LogsWithPrepTracker logs_with_prep_tracker_;
+
+  // Callback for compaction to check if a key is visible to a snapshot.
+  // REQUIRES: mutex held
+  std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+  // Callback for when the cached_recoverable_state_ is written to memtable
+  // Only to be set during initialization
+  std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+  // handle for scheduling stats dumping at fixed intervals
+  // REQUIRES: mutex locked
+  std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_dump_stats_;
+
+  // handle for scheduling stats snapshotting at fixed intervals
+  // REQUIRES: mutex locked
+  std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_persist_stats_;
+
+  // When set, we use a separate queue for writes that don't write to the
+  // memtable. In 2PC these are the writes at the Prepare phase.
+  const bool two_write_queues_;
+  const bool manual_wal_flush_;
+
+  // When set, LastSequence also indicates the last published sequence visible
+  // to the readers. Otherwise LastPublishedSequence should be used.
+  const bool last_seq_same_as_publish_seq_;
+  // It indicates that a customized gc algorithm must be used for
+  // flush/compaction and if it is not provided via SnapshotChecker, we should
+  // disable gc to be safe.
+  const bool use_custom_gc_;
+  // Flag to indicate that the DB instance shutdown has been initiated. This
+  // is different from the shutting_down_ atomic in that it is set at the
+  // beginning of the shutdown sequence, specifically in order to prevent any
+  // background error recovery from going on in parallel. The latter,
+  // shutting_down_, is set a little later during the shutdown after
+  // scheduling memtable flushes.
+  std::atomic<bool> shutdown_initiated_;
+  // Flag to indicate whether the sst_file_manager object was allocated in
+  // DB::Open() or passed to us
+  bool own_sfm_;
+
+  // Clients must periodically call SetPreserveDeletesSequenceNumber()
+  // to advance this seqnum. Default value is 0 which means ALL deletes are
+  // preserved. Note that this has no effect if DBOptions.preserve_deletes
+  // is set to false.
+  std::atomic<SequenceNumber> preserve_deletes_seqnum_;
+  const bool preserve_deletes_;
+
+  // Flag to check whether Close() has been called on this DB
+  bool closed_;
+
+  ErrorHandler error_handler_;
+
+  // Conditional variable to coordinate installation of atomic flush results.
+  // With atomic flush, each bg thread installs the result of flushing
+  // multiple column families, and different threads can flush different
+  // column families. It's difficult to rely on one thread to perform batch
+  // installation for all threads. This is different from the non-atomic
+  // flush case.
+  // atomic_flush_install_cv_ makes sure that threads install atomic flush
+  // results sequentially. Flush results of memtables with lower IDs get
+  // installed to MANIFEST first.
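+  // (Illustrative: if thread A flushed memtables {1, 2} of a column family
+  //  and thread B flushed memtable {3}, B waits on this condition variable
+  //  until A's results are installed, since installation must follow
+  //  memtable ID order.)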
+  InstrumentedCondVar atomic_flush_install_cv_;
+
+  bool wal_in_db_path_;
+};
+
+extern Options SanitizeOptions(const std::string& db, const Options& src);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
+
+extern CompressionType GetCompressionFlush(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    autovector<VersionEdit*> edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+    const autovector<MemTable*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
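+// (Usage sketch — illustrative values only: option-sanitizing code clamps a
+//  setting into a sane interval, e.g.
+//    int max_open_files = 5;
+//    ClipToRange(&max_open_files, 20, 1000000);  // -> 20
+//  leaving in-range values untouched and clamping out-of-range ones to the
+//  nearest bound.)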
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..c7b3510c3
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3116 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+  // Check if we have enough room to do the compaction
+  bool enough_room = true;
+#ifndef ROCKSDB_LITE
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm) {
+    // Pass the current bg_error_ to SFM so it can decide what checks to
+    // perform. If this DB instance hasn't seen any error yet, the SFM can be
+    // optimistic and not do disk space checks.
+    enough_room =
+        sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError());
+    if (enough_room) {
+      *sfm_reserved_compact_space = true;
+    }
+  }
+#else
+  (void)cfd;
+  (void)inputs;
+  (void)sfm_reserved_compact_space;
+#endif  // ROCKSDB_LITE
+  if (!enough_room) {
+    // Just in case tests want to change the value of enough_room
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "Cancelled compaction because not enough room");
+    RecordTick(stats_, COMPACTION_CANCELLED, 1);
+  }
+  return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+                                    std::unique_ptr<TaskLimiterToken>* token,
+                                    LogBuffer* log_buffer) {
+  assert(*token == nullptr);
+  auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+      cfd->ioptions()->compaction_thread_limiter.get());
+  if (limiter == nullptr) {
+    return true;
+  }
+  *token = limiter->GetToken(force);
+  if (*token != nullptr) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "Thread limiter [%s] increase [%s] compaction task, "
+                     "force: %s, tasks after: %d",
+                     limiter->GetName().c_str(), cfd->GetName().c_str(),
+                     force ? "true" : "false", limiter->GetOutstandingTask());
+    return true;
+  }
+  return false;
+}
+
+Status DBImpl::SyncClosedLogs(JobContext* job_context) {
+  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+  mutex_.AssertHeld();
+  autovector<log::Writer*, 1> logs_to_sync;
+  uint64_t current_log_number = logfile_number_;
+  while (logs_.front().number < current_log_number &&
+         logs_.front().getting_synced) {
+    log_sync_cv_.Wait();
+  }
+  for (auto it = logs_.begin();
+       it != logs_.end() && it->number < current_log_number; ++it) {
+    auto& log = *it;
+    assert(!log.getting_synced);
+    log.getting_synced = true;
+    logs_to_sync.push_back(log.writer);
+  }
+
+  Status s;
+  if (!logs_to_sync.empty()) {
+    mutex_.Unlock();
+
+    for (log::Writer* log : logs_to_sync) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+                     log->get_log_number());
+      s = log->file()->Sync(immutable_db_options_.use_fsync);
+      if (!s.ok()) {
+        break;
+      }
+
+      if (immutable_db_options_.recycle_log_file_num > 0) {
+        s = log->Close();
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+    if (s.ok()) {
+      s = directories_.GetWalDir()->Fsync();
+    }
+
+    mutex_.Lock();
+
+    // "number <= current_log_number - 1" is equivalent to
+    // "number < current_log_number".
+    MarkLogsSynced(current_log_number - 1, true, s);
+    if (!s.ok()) {
+      error_handler_.SetBGError(s, BackgroundErrorReason::kFlush);
+      TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+      return s;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    bool* made_progress, JobContext* job_context,
+    SuperVersionContext* superversion_context,
+    std::vector<SequenceNumber>& snapshot_seqs,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+  assert(cfd->imm()->NumNotFlushed() != 0);
+  assert(cfd->imm()->IsFlushPending());
+
+  FlushJob flush_job(
+      dbname_, cfd, immutable_db_options_, mutable_cf_options,
+      nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(),
+      &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+      snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+      GetDataDir(cfd, 0U),
+      GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+      &event_logger_, mutable_cf_options.report_bg_io_stats,
+      true /* sync_output_directory */, true /* write_manifest */, thread_pri);
+
+  FileMetaData file_meta;
+
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+  flush_job.PickMemTable();
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables");
+
+#ifndef ROCKSDB_LITE
+  // may temporarily unlock and lock the mutex.
+  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif  // ROCKSDB_LITE
+
+  Status s;
+  if (logfile_number_ > 0 &&
+      versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) {
+    // If there is more than one column family, we need to make sure that
+    // all the log files except the most recent one are synced. Otherwise if
+    // the host crashes after flushing and before the WAL is persistent, the
+    // flushed SST may contain data from write batches whose updates to
+    // other column families are missing.
+    // SyncClosedLogs() may unlock and re-lock the db_mutex.
+    s = SyncClosedLogs(job_context);
+  } else {
+    TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+  }
+
+  // Within flush_job.Run, rocksdb may call event listener to notify
+  // file creation and deletion.
+  //
+  // Note that flush_job.Run will unlock and lock the db_mutex,
+  // and EventListener callback will be called when the db_mutex
+  // is unlocked by the current thread.
+  if (s.ok()) {
+    s = flush_job.Run(&logs_with_prep_tracker_, &file_meta);
+  } else {
+    flush_job.Cancel();
+  }
+
+  if (s.ok()) {
+    InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+                                       mutable_cf_options);
+    if (made_progress) {
+      *made_progress = true;
+    }
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                     cfd->GetName().c_str(),
+                     cfd->current()->storage_info()->LevelSummary(&tmp));
+  }
+
+  if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+    Status new_bg_error = s;
+    error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+  }
+  if (s.ok()) {
+#ifndef ROCKSDB_LITE
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushCompleted(cfd, mutable_cf_options,
+                           flush_job.GetCommittedFlushJobsInfo());
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm) {
+      // Notify sst_file_manager that a new file was added
+      std::string file_path = MakeTableFileName(
+          cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+      sfm->OnAddFile(file_path);
+      if (sfm->IsMaxAllowedSpaceReached()) {
+        Status new_bg_error =
+            Status::SpaceLimit("Max allowed space was reached");
+        TEST_SYNC_POINT_CALLBACK(
+            "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+            &new_bg_error);
+        error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+      }
+    }
+#endif  // ROCKSDB_LITE
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+  return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  if (immutable_db_options_.atomic_flush) {
+    return AtomicFlushMemTablesToOutputFiles(
+        bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+  }
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  Status status;
+  for (auto& arg : bg_flush_args) {
+    ColumnFamilyData* cfd = arg.cfd_;
+    MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+    SuperVersionContext* superversion_context = arg.superversion_context_;
+    Status s = FlushMemTableToOutputFile(
+        cfd, mutable_cf_options, made_progress, job_context,
+        superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, log_buffer, thread_pri);
+    if (!s.ok()) {
+      status = s;
+      if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+        // At this point, DB is not shutting down, nor is cfd dropped.
+        // Something is wrong, thus we break out of the loop.
+        break;
+      }
+    }
+  }
+  return status;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
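+// (For example — illustrative IDs: with column families {cf1, cf2} and
+//  bg_flush_args specifying max memtable IDs {3, 5}, memtables 1..3 of cf1
+//  and 1..5 of cf2 are flushed, and the results are committed to the
+//  MANIFEST only after both sets of flushes succeed.)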
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+
+  autovector<ColumnFamilyData*> cfds;
+  for (const auto& arg : bg_flush_args) {
+    cfds.emplace_back(arg.cfd_);
+  }
+
+#ifndef NDEBUG
+  for (const auto cfd : cfds) {
+    assert(cfd->imm()->NumNotFlushed() != 0);
+    assert(cfd->imm()->IsFlushPending());
+  }
+#endif /* !NDEBUG */
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  autovector<Directory*> distinct_output_dirs;
+  autovector<std::string> distinct_output_dir_paths;
+  std::vector<std::unique_ptr<FlushJob>> jobs;
+  std::vector<MutableCFOptions> all_mutable_cf_options;
+  int num_cfs = static_cast<int>(cfds.size());
+  all_mutable_cf_options.reserve(num_cfs);
+  for (int i = 0; i < num_cfs; ++i) {
+    auto cfd = cfds[i];
+    Directory* data_dir = GetDataDir(cfd, 0U);
+    const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+    // Add to distinct output directories if eligible. Use linear search.
+    // Since the number of elements in the vector is not large, performance
+    // should be tolerable.
+    bool found = false;
+    for (const auto& path : distinct_output_dir_paths) {
+      if (path == curr_path) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      distinct_output_dir_paths.emplace_back(curr_path);
+      distinct_output_dirs.emplace_back(data_dir);
+    }
+
+    all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+    const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_);
+    jobs.emplace_back(new FlushJob(
+        dbname_, cfd, immutable_db_options_, mutable_cf_options,
+        max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+        &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+        data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+        stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+        false /* sync_output_directory */, false /* write_manifest */,
+        thread_pri));
+    jobs.back()->PickMemTable();
+  }
+
+  std::vector<FileMetaData> file_meta(num_cfs);
+  Status s;
+  assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+  for (int i = 0; i != num_cfs; ++i) {
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+                       job_context->job_id);
+  }
+#endif /* !ROCKSDB_LITE */
+
+  if (logfile_number_ > 0) {
+    // TODO (yanqin) investigate whether we should sync the closed logs for
+    // single column family case.
+    s = SyncClosedLogs(job_context);
+  }
+
+  // exec_status stores the execution status of flush_jobs as
+  // <bool /* executed */, Status /* status code */>
+  autovector<std::pair<bool, Status>> exec_status;
+  for (int i = 0; i != num_cfs; ++i) {
+    // Initially all jobs are not executed, with status OK.
+    exec_status.emplace_back(false, Status::OK());
+  }
+
+  if (s.ok()) {
+    // TODO (yanqin): parallelize jobs with threads.
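+    // (Note: jobs[1..num_cfs) run in this thread first; jobs[0] runs only
+    //  after the sync points below, presumably so tests can pause in a state
+    //  where some flush jobs have completed while one has not yet started.)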
+    for (int i = 1; i != num_cfs; ++i) {
+      exec_status[i].second =
+          jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]);
+      exec_status[i].first = true;
+    }
+    if (num_cfs > 1) {
+      TEST_SYNC_POINT(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+      TEST_SYNC_POINT(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+    }
+    assert(exec_status.size() > 0);
+    assert(!file_meta.empty());
+    exec_status[0].second =
+        jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]);
+    exec_status[0].first = true;
+
+    Status error_status;
+    for (const auto& e : exec_status) {
+      if (!e.second.ok()) {
+        s = e.second;
+        if (!e.second.IsShutdownInProgress() &&
+            !e.second.IsColumnFamilyDropped()) {
+          // If a flush job did not return OK, and the CF is not dropped, and
+          // the DB is not shutting down, then we have to return this result
+          // to the caller later.
+          error_status = e.second;
+        }
+      }
+    }
+
+    s = error_status.ok() ? s : error_status;
+  }
+
+  if (s.IsColumnFamilyDropped()) {
+    s = Status::OK();
+  }
+
+  if (s.ok() || s.IsShutdownInProgress()) {
+    // Sync on all distinct output directories.
+    for (auto dir : distinct_output_dirs) {
+      if (dir != nullptr) {
+        Status error_status = dir->Fsync();
+        if (!error_status.ok()) {
+          s = error_status;
+          break;
+        }
+      }
+    }
+  } else {
+    // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+    // it is not because of CF drop.
+    // Have to cancel the flush jobs that have NOT executed because we need to
+    // unref the versions.
+    for (int i = 0; i != num_cfs; ++i) {
+      if (!exec_status[i].first) {
+        jobs[i]->Cancel();
+      }
+    }
+    for (int i = 0; i != num_cfs; ++i) {
+      if (exec_status[i].first && exec_status[i].second.ok()) {
+        auto& mems = jobs[i]->GetMemTables();
+        cfds[i]->imm()->RollbackMemtableFlush(mems,
+                                              file_meta[i].fd.GetNumber());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    auto wait_to_install_func = [&]() {
+      bool ready = true;
+      for (size_t i = 0; i != cfds.size(); ++i) {
+        const auto& mems = jobs[i]->GetMemTables();
+        if (cfds[i]->IsDropped()) {
+          // If the column family is dropped, then do not wait.
+          continue;
+        } else if (!mems.empty() &&
+                   cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+          // If a flush job needs to install the flush result for mems and
+          // mems[0] is not the earliest memtable, it means another thread
+          // must be installing flush results for the same column family, so
+          // the current thread needs to wait.
+          ready = false;
+          break;
+        } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+                                       bg_flush_args[i].max_memtable_id_) {
+          // If a flush job does not need to install flush results, then it
+          // has to wait until all memtables up to max_memtable_id_
+          // (inclusive) are installed.
+          ready = false;
+          break;
+        }
+      }
+      return ready;
+    };
+
+    bool resuming_from_bg_err = error_handler_.IsDBStopped();
+    while ((!error_handler_.IsDBStopped() ||
+            error_handler_.GetRecoveryError().ok()) &&
+           !wait_to_install_func()) {
+      atomic_flush_install_cv_.Wait();
+    }
+
+    s = resuming_from_bg_err ? error_handler_.GetRecoveryError()
+                             : error_handler_.GetBGError();
+  }
+  if (s.ok()) {
+    autovector<ColumnFamilyData*> tmp_cfds;
+    autovector<const autovector<MemTable*>*> mems_list;
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    autovector<FileMetaData*> tmp_file_meta;
+    for (int i = 0; i != num_cfs; ++i) {
+      const auto& mems = jobs[i]->GetMemTables();
+      if (!cfds[i]->IsDropped() && !mems.empty()) {
+        tmp_cfds.emplace_back(cfds[i]);
+        mems_list.emplace_back(&mems);
+        mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+        tmp_file_meta.emplace_back(&file_meta[i]);
+      }
+    }
+
+    s = InstallMemtableAtomicFlushResults(
+        nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+        versions_.get(), &mutex_, tmp_file_meta,
+        &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer);
+  }
+
+  if (s.ok()) {
+    assert(num_cfs ==
+           static_cast<int>(job_context->superversion_contexts.size()));
+    for (int i = 0; i != num_cfs; ++i) {
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      InstallSuperVersionAndScheduleWork(cfds[i],
+                                         &job_context->superversion_contexts[i],
+                                         all_mutable_cf_options[i]);
+      VersionStorageInfo::LevelSummaryStorage tmp;
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                       cfds[i]->GetName().c_str(),
+                       cfds[i]->current()->storage_info()->LevelSummary(&tmp));
+    }
+    if (made_progress) {
+      *made_progress = true;
+    }
+#ifndef ROCKSDB_LITE
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+    for (int i = 0; i != num_cfs; ++i) {
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+                             jobs[i]->GetCommittedFlushJobsInfo());
+      if (sfm) {
+        std::string file_path = MakeTableFileName(
+            cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+        sfm->OnAddFile(file_path);
+        if (sfm->IsMaxAllowedSpaceReached() &&
+            error_handler_.GetBGError().ok()) {
+          Status new_bg_error =
+              Status::SpaceLimit("Max allowed space was reached");
+          error_handler_.SetBGError(new_bg_error,
+                                    BackgroundErrorReason::kFlush);
+        }
+      }
+    }
+#endif  // ROCKSDB_LITE
+  }
+
+  if (!s.ok() && !s.IsShutdownInProgress()) {
+    Status new_bg_error = s;
+    error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+  }
+
+  return s;
+}
+
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+                                const MutableCFOptions& mutable_cf_options,
+                                int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  bool triggered_writes_slowdown =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_slowdown_writes_trigger);
+  bool triggered_writes_stop =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_stop_writes_trigger);
+  // release lock while notifying events
+  mutex_.Unlock();
+  {
+    FlushJobInfo info{};
+    info.cf_id = cfd->GetID();
+    info.cf_name = cfd->GetName();
+    // TODO(yhchiang): make db_paths dynamic in case flush does not
+    // go to L0 in the future.
+    const uint64_t file_number = file_meta->fd.GetNumber();
+    info.file_path =
+        MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+    info.file_number = file_number;
+    info.thread_id = env_->GetThreadID();
+    info.job_id = job_id;
+    info.triggered_writes_slowdown = triggered_writes_slowdown;
+    info.triggered_writes_stop = triggered_writes_stop;
+    info.smallest_seqno = file_meta->fd.smallest_seqno;
+    info.largest_seqno = file_meta->fd.largest_seqno;
+    info.flush_reason = cfd->GetFlushReason();
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnFlushBegin(this, info);
+    }
+  }
+  mutex_.Lock();
+// no need to signal bg_cv_ as it will be signaled at the end of the
+// flush process.
+#else
+  (void)cfd;
+  (void)file_meta;
+  (void)mutable_cf_options;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnFlushCompleted(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+#ifndef ROCKSDB_LITE
+  assert(flush_jobs_info != nullptr);
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  bool triggered_writes_slowdown =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_slowdown_writes_trigger);
+  bool triggered_writes_stop =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_stop_writes_trigger);
+  // release lock while notifying events
+  mutex_.Unlock();
+  {
+    for (auto& info : *flush_jobs_info) {
+      info->triggered_writes_slowdown = triggered_writes_slowdown;
+      info->triggered_writes_stop = triggered_writes_stop;
+      for (auto listener : immutable_db_options_.listeners) {
+        listener->OnFlushCompleted(this, *info);
+      }
+    }
+    flush_jobs_info->clear();
+  }
+  mutex_.Lock();
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+#else
+  (void)cfd;
+  (void)mutable_cf_options;
+  (void)flush_jobs_info;
+#endif  // ROCKSDB_LITE
+}
+
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+                            ColumnFamilyHandle* column_family,
+                            const Slice* begin, const Slice* end) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+    return Status::InvalidArgument("Invalid target path ID");
+  }
+
+  bool exclusive = options.exclusive_manual_compaction;
+
+  bool flush_needed = true;
+  if (begin != nullptr && end != nullptr) {
+    // TODO(ajkr): We could also optimize away the flush in certain cases where
+    // one/both sides of the interval are unbounded. But it requires more
+    // changes to RangesOverlapWithMemtables.
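+    // A minimal caller-side sketch of the bounded-range path handled below
+    // (the `db` handle and key names are hypothetical, shown only for
+    // illustration):
+    //   Slice begin_key("user_0000"), end_key("user_9999");
+    //   CompactRangeOptions cro;
+    //   Status st = db->CompactRange(cro, cf_handle, &begin_key, &end_key);
+    // Only when both bounds are non-null can the memtable overlap check
+    // below potentially skip the flush that normally precedes a manual
+    // compaction.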
+    Range range(*begin, *end);
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed);
+    CleanupSuperVersion(super_version);
+  }
+
+  Status s;
+  if (flush_needed) {
+    FlushOptions fo;
+    fo.allow_write_stall = options.allow_write_stall;
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      mutex_.Lock();
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+                               false /* writes_stopped */);
+    } else {
+      s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+                        false /* writes_stopped */);
+    }
+    if (!s.ok()) {
+      LogFlush(immutable_db_options_.info_log);
+      return s;
+    }
+  }
+
+  int max_level_with_files = 0;
+  // max_file_num_to_ignore can be used to filter out newly created SST files,
+  // useful for bottom level compaction in a manual compaction
+  uint64_t max_file_num_to_ignore = port::kMaxUint64;
+  uint64_t next_file_number = port::kMaxUint64;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    Version* base = cfd->current();
+    for (int level = 1; level < base->storage_info()->num_non_empty_levels();
+         level++) {
+      if (base->storage_info()->OverlapInLevel(level, begin, end)) {
+        max_level_with_files = level;
+      }
+    }
+    next_file_number = versions_->current_next_file_number();
+  }
+
+  int final_output_level = 0;
+
+  if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+      cfd->NumberLevels() > 1) {
+    // Always compact all files together.
+    final_output_level = cfd->NumberLevels() - 1;
+    // if bottom most level is reserved
+    if (immutable_db_options_.allow_ingest_behind) {
+      final_output_level--;
+    }
+    s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+                            final_output_level, options, begin, end, exclusive,
+                            false, max_file_num_to_ignore);
+  } else {
+    for (int level = 0; level <= max_level_with_files; level++) {
+      int output_level;
+      // in case the compaction is universal or if we're compacting the
+      // bottom-most level, the output level will be the same as input one.
+      // level 0 can never be the bottommost level (i.e. if all files are in
+      // level 0, we will compact to level 1)
+      if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+          cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+        output_level = level;
+      } else if (level == max_level_with_files && level > 0) {
+        if (options.bottommost_level_compaction ==
+            BottommostLevelCompaction::kSkip) {
+          // Skip bottommost level compaction
+          continue;
+        } else if (options.bottommost_level_compaction ==
+                       BottommostLevelCompaction::kIfHaveCompactionFilter &&
+                   cfd->ioptions()->compaction_filter == nullptr &&
+                   cfd->ioptions()->compaction_filter_factory == nullptr) {
+          // Skip bottommost level compaction since we don't have a compaction
+          // filter
+          continue;
+        }
+        output_level = level;
+        // update max_file_num_to_ignore only for bottom level compaction
+        // because data in newly compacted files in middle levels may still need
+        // to be pushed down
+        max_file_num_to_ignore = next_file_number;
+      } else {
+        output_level = level + 1;
+        if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+            cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+            level == 0) {
+          output_level = ColumnFamilyData::kCompactToBaseLevel;
+        }
+      }
+      s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+                              exclusive, false, max_file_num_to_ignore);
+      if (!s.ok()) {
+        break;
+      }
+      if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+        final_output_level = cfd->NumberLevels() - 1;
+      } else if (output_level > final_output_level) {
+        final_output_level = output_level;
+      }
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+    }
+  }
+  if (!s.ok()) {
+    LogFlush(immutable_db_options_.info_log);
+    return s;
+  }
+
+  if (options.change_level) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[RefitLevel] waiting for background threads to stop");
+    s = PauseBackgroundWork();
+    if (s.ok()) {
+      s = ReFitLevel(cfd, final_output_level, options.target_level);
+    }
+    ContinueBackgroundWork();
+  }
+  LogFlush(immutable_db_options_.info_log);
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // an automatic compaction that has been scheduled might have been
+    // preempted by the manual compactions. Need to schedule it back.
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+                            ColumnFamilyHandle* column_family,
+                            const std::vector<std::string>& input_file_names,
+                            const int output_level, const int output_path_id,
+                            std::vector<std::string>* const output_file_names,
+                            CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+  (void)compact_options;
+  (void)column_family;
+  (void)input_file_names;
+  (void)output_level;
+  (void)output_path_id;
+  (void)output_file_names;
+  (void)compaction_job_info;
+  // not supported in lite version
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  if (column_family == nullptr) {
+    return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+  }
+
+  auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+  assert(cfd);
+
+  Status s;
+  JobContext job_context(0, true);
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  // Perform CompactFiles
+  TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+    WaitForIngestFile();
+
+    // We need to get current after `WaitForIngestFile`, because
+    // `IngestExternalFile` may add files that overlap with `input_file_names`
+    auto* current = cfd->current();
+    current->Ref();
+
+    s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+                         output_file_names, output_level, output_path_id,
+                         &job_context, &log_buffer, compaction_job_info);
+
+    current->Unref();
+  }
+
+  // Find and delete obsolete files
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // If !s.ok(), this means that Compaction failed. In that case, we want
+    // to delete all obsolete files we might have created and we force
+    // FindObsoleteFiles(). This is because job_context does not
+    // catch all created files if compaction failed.
+    FindObsoleteFiles(&job_context, !s.ok());
+  }  // release the mutex
+
+  // delete unnecessary files if any, this is done outside the mutex
+  if (job_context.HaveSomethingToClean() ||
+      job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the destructor of DB can kick in and destroy all the
+    // states of DB so info_log might not be available after that point.
+    // It also applies to access other states that DB owns.
+    log_buffer.FlushBufferToLog();
+    if (job_context.HaveSomethingToDelete()) {
+      // no mutex is locked here. No need to Unlock() and Lock() here.
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+  }
+
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+    const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+    Version* version, const std::vector<std::string>& input_file_names,
+    std::vector<std::string>* const output_file_names, const int output_level,
+    int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+    CompactionJobInfo* compaction_job_info) {
+  mutex_.AssertHeld();
+
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return Status::ShutdownInProgress();
+  }
+  if (manual_compaction_paused_.load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (const auto& file_name : input_file_names) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  ColumnFamilyMetaData cf_meta;
+  // TODO(yhchiang): can directly use version here if none of the
+  // following function calls is pluggable to external developers.
+  version->GetColumnFamilyMetaData(&cf_meta);
+
+  if (output_path_id < 0) {
+    if (cfd->ioptions()->cf_paths.size() == 1U) {
+      output_path_id = 0;
+    } else {
+      return Status::NotSupported(
+          "Automatic output path selection is not "
+          "yet supported in CompactFiles()");
+    }
+  }
+
+  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+      &input_set, cf_meta, output_level);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<CompactionInputFiles> input_files;
+  s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, version->storage_info(), compact_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& inputs : input_files) {
+    if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+      return Status::Aborted(
+          "Some of the necessary compaction input "
+          "files are already being compacted");
+    }
+  }
+  bool sfm_reserved_compact_space = false;
+  // First check if we have enough room to do the compaction
+  bool enough_room = EnoughRoomForCompaction(
+      cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+  if (!enough_room) {
+    // m's vars will get set properly at the end of this function,
+    // as long as status == CompactionTooLarge
+    return Status::CompactionTooLarge();
+  }
+
+  // At this point, CompactFiles will be run.
+  bg_compaction_scheduled_++;
+
+  std::unique_ptr<Compaction> c;
+  assert(cfd->compaction_picker());
+  c.reset(cfd->compaction_picker()->CompactFiles(
+      compact_options, input_files, output_level, version->storage_info(),
+      *cfd->GetLatestMutableCFOptions(), output_path_id));
+  // we already sanitized the set of input files and checked for conflicts
+  // without releasing the lock, so we're guaranteed a compaction can be formed.
+  assert(c != nullptr);
+
+  c->SetInputVersion(version);
+  // deletion compaction currently not allowed in CompactFiles.
+  assert(!c->deletion_compaction());
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+      new std::list<uint64_t>::iterator(
+          CaptureCurrentFileNumberInPendingOutputs()));
+
+  assert(is_snapshot_supported_ || snapshots_.empty());
+  CompactionJobStats compaction_job_stats;
+  CompactionJob compaction_job(
+      job_context->job_id, c.get(), immutable_db_options_,
+      file_options_for_compaction_, versions_.get(), &shutting_down_,
+      preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(),
+      GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_,
+      &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+      snapshot_checker, table_cache_, &event_logger_,
+      c->mutable_cf_options()->paranoid_file_checks,
+      c->mutable_cf_options()->report_bg_io_stats, dbname_,
+      &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here.
+  version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+                                                  *c->mutable_cf_options());
+
+  compaction_job.Prepare();
+
+  mutex_.Unlock();
+  TEST_SYNC_POINT("CompactFilesImpl:0");
+  TEST_SYNC_POINT("CompactFilesImpl:1");
+  compaction_job.Run();
+  TEST_SYNC_POINT("CompactFilesImpl:2");
+  TEST_SYNC_POINT("CompactFilesImpl:3");
+  mutex_.Lock();
+
+  Status status = compaction_job.Install(*c->mutable_cf_options());
+  if (status.ok()) {
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+  }
+  c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+  // Need to make sure SstFileManager does its bookkeeping
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm && sfm_reserved_compact_space) {
+    sfm->OnCompactionCompletion(c.get());
+  }
+#endif  // ROCKSDB_LITE
+
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+  if (compaction_job_info != nullptr) {
+    BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+                           job_context->job_id, version, compaction_job_info);
+  }
+
+  if (status.ok()) {
+    // Done
+  } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutting down
+  } else if (status.IsManualCompactionPaused()) {
+    // Don't report stopping manual compaction as error
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Stopping manual compaction",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id);
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Compaction error: %s",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id, status.ToString().c_str());
+    error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+  }
+
+  if (output_file_names != nullptr) {
+    for (const auto newf : c->edit()->GetNewFiles()) {
+      (*output_file_names)
+          .push_back(TableFileName(c->immutable_cf_options()->cf_paths,
+                                   newf.second.fd.GetNumber(),
+                                   newf.second.fd.GetPathId()));
+    }
+  }
+
+  c.reset();
+
+  bg_compaction_scheduled_--;
+  if (bg_compaction_scheduled_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  MaybeScheduleFlushOrCompaction();
+  TEST_SYNC_POINT("CompactFilesImpl:End");
+
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+Status DBImpl::PauseBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  bg_compaction_paused_++;
+  while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+         bg_flush_scheduled_ > 0) {
+    bg_cv_.Wait();
+  }
+  bg_work_paused_++;
+  return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  if (bg_work_paused_ == 0) {
+    return Status::InvalidArgument();
+  }
+  assert(bg_work_paused_ > 0);
+  assert(bg_compaction_paused_ > 0);
+  bg_compaction_paused_--;
+  bg_work_paused_--;
+  // It's sufficient to check just bg_work_paused_ here since
+  // bg_work_paused_ is always no greater than bg_compaction_paused_
+  if (bg_work_paused_ == 0) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+                                     const Status& st,
+                                     const CompactionJobStats& job_stats,
+                                     int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.empty()) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  if (c->is_manual_compaction() &&
+      manual_compaction_paused_.load(std::memory_order_acquire)) {
+    return;
+  }
+
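+  // The notification below fans out to any EventListener registered in
+  // DBOptions::listeners. A minimal listener sketch (hypothetical class
+  // name, shown only for illustration):
+  //   class CompactionLogger : public EventListener {
+  //     void OnCompactionBegin(DB* /*db*/,
+  //                            const CompactionJobInfo& info) override {
+  //       // info.input_files and info.output_level are populated below.
+  //     }
+  //   };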
+  Version* current = cfd->current();
+  current->Ref();
+  // release lock while notifying events
+  mutex_.Unlock();
+  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+  {
+    CompactionJobInfo info{};
+    info.cf_name = cfd->GetName();
+    info.status = st;
+    info.thread_id = env_->GetThreadID();
+    info.job_id = job_id;
+    info.base_input_level = c->start_level();
+    info.output_level = c->output_level();
+    info.stats = job_stats;
+    info.table_properties = c->GetOutputTableProperties();
+    info.compaction_reason = c->compaction_reason();
+    info.compression = c->output_compression();
+    for (size_t i = 0; i < c->num_input_levels(); ++i) {
+      for (const auto fmd : *c->inputs(i)) {
+        const FileDescriptor& desc = fmd->fd;
+        const uint64_t file_number = desc.GetNumber();
+        auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
+                                file_number, desc.GetPathId());
+        info.input_files.push_back(fn);
+        info.input_file_infos.push_back(CompactionFileInfo{
+            static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+        if (info.table_properties.count(fn) == 0) {
+          std::shared_ptr<const TableProperties> tp;
+          auto s = current->GetTableProperties(&tp, fmd, &fn);
+          if (s.ok()) {
+            info.table_properties[fn] = tp;
+          }
+        }
+      }
+    }
+    for (const auto newf : c->edit()->GetNewFiles()) {
+      const FileMetaData& meta = newf.second;
+      const FileDescriptor& desc = meta.fd;
+      const uint64_t file_number = desc.GetNumber();
+      info.output_files.push_back(TableFileName(
+          c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId()));
+      info.output_file_infos.push_back(CompactionFileInfo{
+          newf.first, file_number, meta.oldest_blob_file_number});
+    }
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnCompactionBegin(this, info);
+    }
+  }
+  mutex_.Lock();
+  current->Unref();
+#else
+  (void)cfd;
+  (void)c;
+  (void)st;
+  (void)job_stats;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+    ColumnFamilyData* cfd, Compaction* c, const Status& st,
+    const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  if (c->is_manual_compaction() &&
+      manual_compaction_paused_.load(std::memory_order_acquire)) {
+    return;
+  }
+  Version* current = cfd->current();
+  current->Ref();
+  // release lock while notifying events
+  mutex_.Unlock();
+  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+  {
+    CompactionJobInfo info{};
+    BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+                           &info);
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnCompactionCompleted(this, info);
+    }
+  }
+  mutex_.Lock();
+  current->Unref();
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+#else
+  (void)cfd;
+  (void)c;
+  (void)st;
+  (void)compaction_job_stats;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+  assert(level < cfd->NumberLevels());
+  if (target_level >= cfd->NumberLevels()) {
+    return Status::InvalidArgument("Target level exceeds number of levels");
+  }
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+
+  Status status;
+
+  InstrumentedMutexLock guard_lock(&mutex_);
+
+  // only allow one thread refitting
+  if (refitting_level_) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[ReFitLevel] another thread is refitting");
+    return Status::NotSupported("another thread is refitting");
+  }
+  refitting_level_ = true;
+
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+  // move to a smaller level
+  int to_level = target_level;
+  if (target_level < 0) {
+    to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+  }
+
+  auto* vstorage = cfd->current()->storage_info();
+  if (to_level > level) {
+    if (level == 0) {
+      return Status::NotSupported(
+          "Cannot change from level 0 to other levels.");
+    }
+    // Check levels are empty for a trivial move
+    for (int l = level + 1; l <= to_level; l++) {
+      if (vstorage->NumLevelFiles(l) > 0) {
+        return Status::NotSupported(
+            "Levels between source and target are not empty for a move.");
+      }
+    }
+  }
+  if (to_level != level) {
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+                    cfd->current()->DebugString().data());
+
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    for (const auto& f : vstorage->LevelFiles(level)) {
+      edit.DeleteFile(level, f->fd.GetNumber());
+      edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
+                   f->fd.GetFileSize(), f->smallest, f->largest,
+                   f->fd.smallest_seqno, f->fd.largest_seqno,
+                   f->marked_for_compaction, f->oldest_blob_file_number,
+                   f->oldest_ancester_time, f->file_creation_time,
+                   f->file_checksum, f->file_checksum_func_name);
+    }
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+                    edit.DebugString().data());
+
+    status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
+                                    directories_.GetDbDir());
+    InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+                    cfd->GetName().c_str(), status.ToString().data());
+
+    if (status.ok()) {
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+                      cfd->current()->DebugString().data());
+    }
+  }
+
+  sv_context.Clean();
+  refitting_level_ = false;
+
+  return status;
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+  return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  InstrumentedMutexLock l(&mutex_);
+  return cfh->cfd()
+      ->GetSuperVersion()
+      ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+                     ColumnFamilyHandle* column_family) {
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
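+  // Caller-side sketch of a manual flush (hypothetical `db` handle, not part
+  // of this change):
+  //   FlushOptions fo;
+  //   fo.wait = true;  // block until the memtable reaches SST storage
+  //   Status st = db->Flush(fo, cf_handle);
+  // With atomic_flush enabled, the single-CF flush below is still routed
+  // through AtomicFlushMemTables so the manifest update stays atomic.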
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+                 cfh->GetName().c_str());
+  Status s;
+  if (immutable_db_options_.atomic_flush) {
+    s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+                             FlushReason::kManualFlush);
+  } else {
+    s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "[%s] Manual flush finished, status: %s\n",
+                 cfh->GetName().c_str(), s.ToString().c_str());
+  return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+                     const std::vector<ColumnFamilyHandle*>& column_families) {
+  Status s;
+  if (!immutable_db_options_.atomic_flush) {
+    for (auto cfh : column_families) {
+      s = Flush(flush_options, cfh);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  } else {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Manual atomic flush start.\n"
+                   "=====Column families:=====");
+    for (auto cfh : column_families) {
+      auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+                     cfhi->GetName().c_str());
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "=====End of column families list=====");
+    autovector<ColumnFamilyData*> cfds;
+    std::for_each(column_families.begin(), column_families.end(),
+                  [&cfds](ColumnFamilyHandle* elem) {
+                    auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+                    cfds.emplace_back(cfh->cfd());
+                  });
+    s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Manual atomic flush finished, status: %s\n"
+                   "=====Column families:=====",
+                   s.ToString().c_str());
+    for (auto cfh : column_families) {
+      auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+                     cfhi->GetName().c_str());
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "=====End of column families list=====");
+  }
+  return s;
+}
+
+Status DBImpl::RunManualCompaction(
+    ColumnFamilyData* cfd, int input_level, int output_level,
+    const CompactRangeOptions& compact_range_options, const Slice* begin,
+    const Slice* end, bool exclusive, bool disallow_trivial_move,
+    uint64_t max_file_num_to_ignore) {
+  assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+         input_level >= 0);
+
+  InternalKey begin_storage, end_storage;
+  CompactionArg* ca;
+
+  bool scheduled = false;
+  bool manual_conflict = false;
+  ManualCompactionState manual;
+  manual.cfd = cfd;
+  manual.input_level = input_level;
+  manual.output_level = output_level;
+  manual.output_path_id = compact_range_options.target_path_id;
+  manual.done = false;
+  manual.in_progress = false;
+  manual.incomplete = false;
+  manual.exclusive = exclusive;
+  manual.disallow_trivial_move = disallow_trivial_move;
+  // For universal compaction, we enforce every manual compaction to compact
+  // all files.
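+  // For leveled compaction the bounds are widened below: a begin user key is
+  // replaced by the smallest possible internal key for that user key, and an
+  // end user key by the largest, so every version of the boundary keys falls
+  // inside the compacted range.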
+  if (begin == nullptr ||
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+    manual.begin = nullptr;
+  } else {
+    begin_storage.SetMinPossibleForUserKey(*begin);
+    manual.begin = &begin_storage;
+  }
+  if (end == nullptr ||
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+    manual.end = nullptr;
+  } else {
+    end_storage.SetMaxPossibleForUserKey(*end);
+    manual.end = &end_storage;
+  }
+
+  TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+  TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+  InstrumentedMutexLock l(&mutex_);
+
+  // When a manual compaction arrives, temporarily disable scheduling of
+  // non-manual compactions and wait until the number of scheduled compaction
+  // jobs drops to zero. This is needed to ensure that this manual compaction
+  // can compact any range of keys/files.
+  //
+  // HasPendingManualCompaction() is true when at least one thread is inside
+  // RunManualCompaction(), i.e. during that time no other compaction will
+  // get scheduled (see MaybeScheduleFlushOrCompaction).
+  //
+  // Note that the following loop doesn't stop more than one thread calling
+  // RunManualCompaction() from getting to the second while loop below.
+  // However, only one of them will actually schedule compaction, while
+  // others will wait on a condition variable until it completes.
+
+  AddManualCompaction(&manual);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+  if (exclusive) {
+    while (bg_bottom_compaction_scheduled_ > 0 ||
+           bg_compaction_scheduled_ > 0) {
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "[%s] Manual compaction waiting for all other scheduled background "
+          "compactions to finish",
+          cfd->GetName().c_str());
+      bg_cv_.Wait();
+    }
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "[%s] Manual compaction starting", cfd->GetName().c_str());
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  // We don't check bg_error_ here, because if we get the error in compaction,
+  // the compaction will set manual.status to bg_error_ and set manual.done to
+  // true.
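+  // The loop below alternates between trying to pick a compaction for
+  // `manual` and parking on bg_cv_ while a conflicting manual compaction is
+  // running; it exits once the scheduled job sets manual.done or once
+  // CompactRange finds nothing to compact.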
+  while (!manual.done) {
+    assert(HasPendingManualCompaction());
+    manual_conflict = false;
+    Compaction* compaction = nullptr;
+    if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+        scheduled ||
+        (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+         ((compaction = manual.cfd->CompactRange(
+               *manual.cfd->GetLatestMutableCFOptions(), manual.input_level,
+               manual.output_level, compact_range_options, manual.begin,
+               manual.end, &manual.manual_end, &manual_conflict,
+               max_file_num_to_ignore)) == nullptr &&
+          manual_conflict))) {
+      // exclusive manual compactions should not see a conflict during
+      // CompactRange
+      assert(!exclusive || !manual_conflict);
+      // Running either this or some other manual compaction
+      bg_cv_.Wait();
+      if (scheduled && manual.incomplete == true) {
+        assert(!manual.in_progress);
+        scheduled = false;
+        manual.incomplete = false;
+      }
+    } else if (!scheduled) {
+      if (compaction == nullptr) {
+        manual.done = true;
+        bg_cv_.SignalAll();
+        continue;
+      }
+      ca = new CompactionArg;
+      ca->db = this;
+      ca->prepicked_compaction = new PrepickedCompaction;
+      ca->prepicked_compaction->manual_compaction_state = &manual;
+      ca->prepicked_compaction->compaction = compaction;
+      if (!RequestCompactionToken(
+              cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+        // Don't throttle manual compaction, only count outstanding tasks.
+        assert(false);
+      }
+      manual.incomplete = false;
+      bg_compaction_scheduled_++;
+      env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+                     &DBImpl::UnscheduleCompactionCallback);
+      scheduled = true;
+    }
+  }
+
+  log_buffer.FlushBufferToLog();
+  assert(!manual.in_progress);
+  assert(HasPendingManualCompaction());
+  RemoveManualCompaction(&manual);
+  bg_cv_.SignalAll();
+  return manual.status;
+}
+
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+                                  FlushRequest* req) {
+  assert(req != nullptr);
+  req->reserve(cfds.size());
+  for (const auto cfd : cfds) {
+    if (nullptr == cfd) {
+      // cfd may be null, see DBImpl::ScheduleFlushes
+      continue;
+    }
+    uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
+    req->emplace_back(cfd, max_memtable_id);
+  }
+}
+
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+                             const FlushOptions& flush_options,
+                             FlushReason flush_reason, bool writes_stopped) {
+  Status s;
+  uint64_t flush_memtable_id = 0;
+  if (!flush_options.allow_write_stall) {
+    bool flush_needed = true;
+    s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+    TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+    if (!s.ok() || !flush_needed) {
+      return s;
+    }
+  }
+  FlushRequest flush_req;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    WriteThread::Writer w;
+    WriteThread::Writer nonmem_w;
+    if (!writes_stopped) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+    }
+    WaitForPendingWrites();
+
+    if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
+      s = SwitchMemtable(cfd, &context);
+    }
+    if (s.ok()) {
+      if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+          !cached_recoverable_state_empty_.load()) {
+        flush_memtable_id = cfd->imm()->GetLatestMemTableID();
+        flush_req.emplace_back(cfd, flush_memtable_id);
+      }
+      if (immutable_db_options_.persist_stats_to_disk) {
+        ColumnFamilyData* cfd_stats =
+            versions_->GetColumnFamilySet()->GetColumnFamily(
+                kPersistentStatsColumnFamilyName);
+        if (cfd_stats != nullptr && cfd_stats != cfd &&
+            !cfd_stats->mem()->IsEmpty()) {
+          // only force flush stats CF when it will be the only CF lagging
+          // behind after the current flush
+          bool stats_cf_flush_needed = true;
+          for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+            if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+              continue;
+            }
+            if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+              stats_cf_flush_needed = false;
+            }
+          }
+          if (stats_cf_flush_needed) {
+            ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                           "Force flushing stats CF with manual flush of %s "
+                           "to avoid holding old logs",
+                           cfd->GetName().c_str());
+            s = SwitchMemtable(cfd_stats, &context);
+            flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID();
+            flush_req.emplace_back(cfd_stats, flush_memtable_id);
+          }
+        }
+      }
+    }
+
+    if (s.ok() && !flush_req.empty()) {
+      for (auto& elem : flush_req) {
+        ColumnFamilyData* loop_cfd = elem.first;
+        loop_cfd->imm()->FlushRequested();
+      }
+      // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be free'ed by
+      // other threads which may drop the column family concurrently.
+      // Therefore, we increase the cfd's ref count.
+      if (flush_options.wait) {
+        for (auto& elem : flush_req) {
+          ColumnFamilyData* loop_cfd = elem.first;
+          loop_cfd->Ref();
+        }
+      }
+      SchedulePendingFlush(flush_req, flush_reason);
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (!writes_stopped) {
+      write_thread_.ExitUnbatched(&w);
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+  if (s.ok() && flush_options.wait) {
+    autovector<ColumnFamilyData*> cfds;
+    autovector<const uint64_t*> flush_memtable_ids;
+    for (auto& iter : flush_req) {
+      cfds.push_back(iter.first);
+      flush_memtable_ids.push_back(&(iter.second));
+    }
+    s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+                              (flush_reason == FlushReason::kErrorRecovery));
+    InstrumentedMutexLock lock_guard(&mutex_);
+    for (auto* tmp_cfd : cfds) {
+      tmp_cfd->UnrefAndTryDelete();
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+  return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
+Status DBImpl::AtomicFlushMemTables(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const FlushOptions& flush_options, FlushReason flush_reason,
+    bool writes_stopped) {
+  Status s;
+  if (!flush_options.allow_write_stall) {
+    int num_cfs_to_flush = 0;
+    for (auto cfd : column_family_datas) {
+      bool flush_needed = true;
+      s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+      if (!s.ok()) {
+        return s;
+      } else if (flush_needed) {
+        ++num_cfs_to_flush;
+      }
+    }
+    if (0 == num_cfs_to_flush) {
+      return s;
+    }
+  }
+  FlushRequest flush_req;
+  autovector<ColumnFamilyData*> cfds;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    WriteThread::Writer w;
+    WriteThread::Writer nonmem_w;
+    if (!writes_stopped) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+    }
+    WaitForPendingWrites();
+
+    for (auto cfd : column_family_datas) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+          !cached_recoverable_state_empty_.load()) {
+        cfds.emplace_back(cfd);
+      }
+    }
+    for (auto cfd : cfds) {
+      if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+        continue;
+      }
+      cfd->Ref();
+      s = SwitchMemtable(cfd, &context);
+      cfd->UnrefAndTryDelete();
+      if (!s.ok()) {
+        break;
+      }
+    }
+    if (s.ok()) {
+      AssignAtomicFlushSeq(cfds);
+      for (auto cfd : cfds) {
+        cfd->imm()->FlushRequested();
+      }
+      // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be free'ed by
+      // other threads which may drop the column family concurrently.
+      // Therefore, we increase the cfd's ref count.
+      if (flush_options.wait) {
+        for (auto cfd : cfds) {
+          cfd->Ref();
+        }
+      }
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, flush_reason);
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (!writes_stopped) {
+      write_thread_.ExitUnbatched(&w);
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+  if (s.ok() && flush_options.wait) {
+    autovector<const uint64_t*> flush_memtable_ids;
+    for (auto& iter : flush_req) {
+      flush_memtable_ids.push_back(&(iter.second));
+    }
+    s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+                              (flush_reason == FlushReason::kErrorRecovery));
+    InstrumentedMutexLock lock_guard(&mutex_);
+    for (auto* cfd : cfds) {
+      cfd->UnrefAndTryDelete();
+    }
+  }
+  return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause a write stall, for example if one memtable is already being flushed.
+// This method tries to avoid the write stall (similar to CompactRange()
+// behavior): it emulates how the SuperVersion / LSM would change if the flush
+// happened, checks the result against various constraints, and delays the
+// flush if it would cause a write stall.
+// The caller should check status and flush_needed to see if a flush already
+// happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+                                                 bool* flush_needed) {
+  {
+    *flush_needed = true;
+    InstrumentedMutexLock l(&mutex_);
+    uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+    WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+    do {
+      if (write_stall_condition != WriteStallCondition::kNormal) {
+        // Same error handling as user writes: Don't wait if there's a
+        // background error, even if it's a soft error. We might wait here
+        // indefinitely as the pending flushes/compactions may never finish
+        // successfully, resulting in the stall condition lasting indefinitely
+        if (error_handler_.IsBGWorkStopped()) {
+          return error_handler_.GetBGError();
+        }
+
+        TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "[%s] WaitUntilFlushWouldNotStallWrites"
+                       " waiting on stall conditions to clear",
+                       cfd->GetName().c_str());
+        bg_cv_.Wait();
+      }
+      if (cfd->IsDropped()) {
+        return Status::ColumnFamilyDropped();
+      }
+      if (shutting_down_.load(std::memory_order_acquire)) {
+        return Status::ShutdownInProgress();
+      }
+
+      uint64_t earliest_memtable_id =
+          std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+      if (earliest_memtable_id > orig_active_memtable_id) {
+        // We waited so long that the memtable we were originally waiting on
+        // was flushed.
+        *flush_needed = false;
+        return Status::OK();
+      }
+
+      const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+      const auto* vstorage = cfd->current()->storage_info();
+
+      // Skip stalling check if we're below auto-flush and auto-compaction
+      // triggers. If it stalled in these conditions, that'd mean the stall
+      // triggers are so low that stalling is needed for any background work. In
+      // that case we shouldn't wait since background work won't be scheduled.
+      if (cfd->imm()->NumNotFlushed() <
+              cfd->ioptions()->min_write_buffer_number_to_merge &&
+          vstorage->l0_delay_trigger_count() <
+              mutable_cf_options.level0_file_num_compaction_trigger) {
+        break;
+      }
+
+      // check whether one extra immutable memtable or an extra L0 file would
+      // cause write stalling mode to be entered. It could still enter stall
+      // mode due to pending compaction bytes, but that's less common
+      write_stall_condition =
+          ColumnFamilyData::GetWriteStallConditionAndCause(
+              cfd->imm()->NumNotFlushed() + 1,
+              vstorage->l0_delay_trigger_count() + 1,
+              vstorage->estimated_compaction_needed_bytes(), mutable_cf_options)
+              .first;
+    } while (write_stall_condition != WriteStallCondition::kNormal);
+  }
+  return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// let N = cfds.size()
+// for i in [0, N),
+//  1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs
+//     have to be flushed for THIS column family;
+//  2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+//     family have to be flushed.
+// Finish waiting when ALL column families finish flushing memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from
+// background error or in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const uint64_t*>& flush_memtable_ids,
+    bool resuming_from_bg_err) {
+  int num = static_cast<int>(cfds.size());
+  // Wait until the flush completes
+  InstrumentedMutexLock l(&mutex_);
+  // If the caller is trying to resume from bg error, then
+  // error_handler_.IsDBStopped() is true.
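+  // Consequently the wait below ends when every CF in the request has either
+  // finished flushing or been dropped, when shutdown begins, when a recovery
+  // error surfaces, or when the DB stops while not resuming from an error.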
+  while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      return Status::ShutdownInProgress();
+    }
+    // If an error has occurred during resumption, then no need to wait.
+    if (!error_handler_.GetRecoveryError().ok()) {
+      break;
+    }
+    // Number of column families that have been dropped.
+    int num_dropped = 0;
+    // Number of column families that have finished flush.
+    int num_finished = 0;
+    for (int i = 0; i < num; ++i) {
+      if (cfds[i]->IsDropped()) {
+        ++num_dropped;
+      } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+                 (flush_memtable_ids[i] != nullptr &&
+                  cfds[i]->imm()->GetEarliestMemTableID() >
+                      *flush_memtable_ids[i])) {
+        ++num_finished;
+      }
+    }
+    if (1 == num_dropped && 1 == num) {
+      return Status::InvalidArgument("Cannot flush a dropped CF");
+    }
+    // Column families involved in this flush request have either been dropped
+    // or finished flush. Then it's time to finish waiting.
+    if (num_dropped + num_finished == num) {
+      break;
+    }
+    bg_cv_.Wait();
+  }
+  Status s;
+  // If not resuming from bg error, and an error has caused the DB to stop,
+  // then report the bg error to caller.
+  if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+    s = error_handler_.GetBGError();
+  }
+  return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+    const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+  Status s;
+  for (auto cf_ptr : column_family_handles) {
+    Status status =
+        this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+    if (!status.ok()) {
+      s = status;
+    }
+  }
+
+  return s;
+}
+
+void DBImpl::DisableManualCompaction() {
+  manual_compaction_paused_.store(true, std::memory_order_release);
+}
+
+void DBImpl::EnableManualCompaction() {
+  manual_compaction_paused_.store(false, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+  mutex_.AssertHeld();
+  if (!opened_successfully_) {
+    // Compaction may introduce data race to DB open
+    return;
+  }
+  if (bg_work_paused_ > 0) {
+    // we paused the background work
+    return;
+  } else if (error_handler_.IsBGWorkStopped() &&
+             !error_handler_.IsRecoveryInProgress()) {
+    // There has been a hard error and this call is not part of the recovery
+    // sequence. Bail out here so we don't get into an endless loop of
+    // scheduling BG work which will again call this function
+    return;
+  } else if (shutting_down_.load(std::memory_order_acquire)) {
+    // DB is being deleted; no more background compactions
+    return;
+  }
+  auto bg_job_limits = GetBGJobLimits();
+  bool is_flush_pool_empty =
+      env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
+  while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+         bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+    bg_flush_scheduled_++;
+    FlushThreadArg* fta = new FlushThreadArg;
+    fta->db_ = this;
+    fta->thread_pri_ = Env::Priority::HIGH;
+    env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+                   &DBImpl::UnscheduleFlushCallback);
+    --unscheduled_flushes_;
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+        &unscheduled_flushes_);
+  }
+
+  // special case -- if high-pri (flush) thread pool is empty, then schedule
+  // flushes in low-pri (compaction) thread pool.
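+  // The HIGH pool size is under user control; a caller may have sized the
+  // pools via the Env, e.g. (hypothetical values, not part of this change):
+  //   options.env->SetBackgroundThreads(0, Env::Priority::HIGH);
+  //   options.env->SetBackgroundThreads(4, Env::Priority::LOW);
+  // With an empty HIGH pool, the fallback below runs flushes in LOW.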
+  if (is_flush_pool_empty) {
+    while (unscheduled_flushes_ > 0 &&
+           bg_flush_scheduled_ + bg_compaction_scheduled_ <
+               bg_job_limits.max_flushes) {
+      bg_flush_scheduled_++;
+      FlushThreadArg* fta = new FlushThreadArg;
+      fta->db_ = this;
+      fta->thread_pri_ = Env::Priority::LOW;
+      env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+                     &DBImpl::UnscheduleFlushCallback);
+      --unscheduled_flushes_;
+    }
+  }
+
+  if (bg_compaction_paused_ > 0) {
+    // we paused the background compaction
+    return;
+  } else if (error_handler_.IsBGWorkStopped()) {
+    // Compaction is not part of the recovery sequence from a hard error. We
+    // might get here because recovery might do a flush and install a new
+    // super version, which will try to schedule pending compactions. Bail
+    // out here and let the higher level recovery handle compactions
+    return;
+  }
+
+  if (HasExclusiveManualCompaction()) {
+    // only manual compactions are allowed to run. don't schedule automatic
+    // compactions
+    TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+    return;
+  }
+
+  while (bg_compaction_scheduled_ < bg_job_limits.max_compactions &&
+         unscheduled_compactions_ > 0) {
+    CompactionArg* ca = new CompactionArg;
+    ca->db = this;
+    ca->prepicked_compaction = nullptr;
+    bg_compaction_scheduled_++;
+    unscheduled_compactions_--;
+    env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+                   &DBImpl::UnscheduleCompactionCallback);
+  }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+  mutex_.AssertHeld();
+  return GetBGJobLimits(immutable_db_options_.max_background_flushes,
+                        mutable_db_options_.max_background_compactions,
+                        mutable_db_options_.max_background_jobs,
+                        write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+                                           int max_background_compactions,
+                                           int max_background_jobs,
+                                           bool parallelize_compactions) {
+  BGJobLimits res;
+  if (max_background_flushes == -1 && max_background_compactions == -1) {
+    // for our first stab implementing max_background_jobs, simply allocate a
+    // quarter of the threads to flushes.
+    res.max_flushes = std::max(1, max_background_jobs / 4);
+    res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
+  } else {
+    // compatibility code in case users haven't migrated to max_background_jobs,
+    // which automatically computes flush/compaction limits
+    res.max_flushes = std::max(1, max_background_flushes);
+    res.max_compactions = std::max(1, max_background_compactions);
+  }
+  if (!parallelize_compactions) {
+    // throttle background compactions until we deem necessary
+    res.max_compactions = 1;
+  }
+  return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+  assert(!cfd->queued_for_compaction());
+  cfd->Ref();
+  compaction_queue_.push_back(cfd);
+  cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+  assert(!compaction_queue_.empty());
+  auto cfd = *compaction_queue_.begin();
+  compaction_queue_.pop_front();
+  assert(cfd->queued_for_compaction());
+  cfd->set_queued_for_compaction(false);
+  return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+  assert(!flush_queue_.empty());
+  FlushRequest flush_req = flush_queue_.front();
+  flush_queue_.pop_front();
+  // TODO: need to unset flush reason?
+  return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+    std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+  assert(!compaction_queue_.empty());
+  assert(*token == nullptr);
+  autovector<ColumnFamilyData*> throttled_candidates;
+  ColumnFamilyData* cfd = nullptr;
+  while (!compaction_queue_.empty()) {
+    auto first_cfd = *compaction_queue_.begin();
+    compaction_queue_.pop_front();
+    assert(first_cfd->queued_for_compaction());
+    if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+      throttled_candidates.push_back(first_cfd);
+      continue;
+    }
+    cfd = first_cfd;
+    cfd->set_queued_for_compaction(false);
+    break;
+  }
+  // Add throttled compaction candidates back to queue in the original order.
+  for (auto iter = throttled_candidates.rbegin();
+       iter != throttled_candidates.rend(); ++iter) {
+    compaction_queue_.push_front(*iter);
+  }
+  return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+                                  FlushReason flush_reason) {
+  if (flush_req.empty()) {
+    return;
+  }
+  for (auto& iter : flush_req) {
+    ColumnFamilyData* cfd = iter.first;
+    cfd->Ref();
+    cfd->SetFlushReason(flush_reason);
+  }
+  ++unscheduled_flushes_;
+  flush_queue_.push_back(flush_req);
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+  if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+    AddToCompactionQueue(cfd);
+    ++unscheduled_compactions_;
+  }
+}
+
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+                                  FileType type, uint64_t number, int job_id) {
+  mutex_.AssertHeld();
+  PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+  purge_files_.insert({{number, std::move(file_info)}});
+}
+
+void DBImpl::BGWorkFlush(void* arg) {
+  FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+  delete reinterpret_cast<FlushThreadArg*>(arg);
+
+  IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+  static_cast_with_check<DBImpl>(fta.db_)->BackgroundCallFlush(
+      fta.thread_pri_);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+  CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+  delete reinterpret_cast<CompactionArg*>(arg);
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+  TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+  auto prepicked_compaction =
+      static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+  static_cast_with_check<DBImpl>(ca.db)->BackgroundCallCompaction(
+      prepicked_compaction, Env::Priority::LOW);
+  delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+  CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+  delete static_cast<CompactionArg*>(arg);
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+  TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+  auto* prepicked_compaction = ca.prepicked_compaction;
+  assert(prepicked_compaction && prepicked_compaction->compaction &&
+         !prepicked_compaction->manual_compaction_state);
+  ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+  delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+  reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
+  CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+  delete reinterpret_cast<CompactionArg*>(arg);
+  if (ca.prepicked_compaction != nullptr) {
+    if (ca.prepicked_compaction->compaction != nullptr) {
+      delete ca.prepicked_compaction->compaction;
+    }
+    delete ca.prepicked_compaction;
+  }
TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback"); +} + +void DBImpl::UnscheduleFlushCallback(void* arg) { + delete reinterpret_cast(arg); + TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); +} + +Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, + LogBuffer* log_buffer, FlushReason* reason, + Env::Priority thread_pri) { + mutex_.AssertHeld(); + + Status status; + *reason = FlushReason::kOthers; + // If BG work is stopped due to an error, but a recovery is in progress, + // that means this flush is part of the recovery. So allow it to go through + if (!error_handler_.IsBGWorkStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + status = Status::ShutdownInProgress(); + } + } else if (!error_handler_.IsRecoveryInProgress()) { + status = error_handler_.GetBGError(); + } + + if (!status.ok()) { + return status; + } + + autovector bg_flush_args; + std::vector& superversion_contexts = + job_context->superversion_contexts; + autovector column_families_not_to_flush; + while (!flush_queue_.empty()) { + // This cfd is already referenced + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + superversion_contexts.clear(); + superversion_contexts.reserve(flush_req.size()); + + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + column_families_not_to_flush.push_back(cfd); + continue; + } + superversion_contexts.emplace_back(SuperVersionContext(true)); + bg_flush_args.emplace_back(cfd, iter.second, + &(superversion_contexts.back())); + } + if (!bg_flush_args.empty()) { + break; + } + } + + if (!bg_flush_args.empty()) { + auto bg_job_limits = GetBGJobLimits(); + for (const auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + ROCKS_LOG_BUFFER( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available " + "%d, " + "flush slots scheduled %d, compaction slots scheduled %d", + cfd->GetName().c_str(), bg_job_limits.max_flushes, + bg_job_limits.max_compactions, bg_flush_scheduled_, + bg_compaction_scheduled_); + } + status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, + job_context, log_buffer, thread_pri); + TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); + // All the CFDs in the FlushReq must have the same flush reason, so just + // grab the first one + *reason = bg_flush_args[0].cfd_->GetFlushReason(); + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + if (cfd->UnrefAndTryDelete()) { + arg.cfd_ = nullptr; + } + } + } + for (auto cfd : column_families_not_to_flush) { + cfd->UnrefAndTryDelete(); + } + return status; +} + +void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { + bool made_progress = false; + JobContext job_context(next_job_id_.fetch_add(1), true); + + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start"); + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + { + InstrumentedMutexLock l(&mutex_); + assert(bg_flush_scheduled_); + num_running_flushes_++; + + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + FlushReason reason; + + Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, + &reason, thread_pri); + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && + reason != FlushReason::kErrorRecovery) { + // Wait a little 
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed flushes for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background flush error: %s"
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      log_buffer.FlushBufferToLog();
+      LogFlush(immutable_db_options_.info_log);
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+
+    TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // If flush failed, we want to delete all temporary files that we might
+    // have created. Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+                                        !s.IsColumnFamilyDropped());
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToClean() ||
+        job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+      // Have to flush the info logs before bg_flush_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the destructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+    TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+    assert(num_running_flushes_ > 0);
+    num_running_flushes_--;
+    bg_flush_scheduled_--;
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    atomic_flush_install_cv_.SignalAll();
+    bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be deallocated and referencing them
+    // will cause trouble.
+  }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+                                      Env::Priority bg_thread_pri) {
+  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  TEST_SYNC_POINT("BackgroundCallCompaction:0");
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+ WaitForIngestFile(); + + num_running_compactions_++; + + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + + assert((bg_thread_pri == Env::Priority::BOTTOM && + bg_bottom_compaction_scheduled_) || + (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_)); + Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer, + prepicked_compaction, bg_thread_pri); + TEST_SYNC_POINT("BackgroundCallCompaction:1"); + if (s.IsBusy()) { + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + env_->SleepForMicroseconds(10000); // prevent hot loop + mutex_.Lock(); + } else if (!s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. + uint64_t error_cnt = + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + log_buffer.FlushBufferToLog(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Waiting after background compaction error: %s, " + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); + LogFlush(immutable_db_options_.info_log); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } else if (s.IsManualCompactionPaused()) { + ManualCompactionState* m = prepicked_compaction->manual_compaction_state; + assert(m); + ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused", + m->cfd->GetName().c_str(), job_context.job_id); + } + + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + // If compaction failed, we want to delete all temporary files that we might + // have created (they might not be all recorded in job_context in case of a + // failure). Thus, we force full scan in FindObsoleteFiles() + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && + !s.IsColumnFamilyDropped()); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); + + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + // Have to flush the info logs before bg_compaction_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. 
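+ // Distilled form of the invariant described above (illustrative sketch):
+ // as long as bg_flush_scheduled_ is nonzero, ~DBImpl waits on bg_cv_, so
+ // members such as info_log stay alive; all buffered logging must
+ // therefore complete before the decrement that happens further below:
+ //
+ // log_buffer.FlushBufferToLog(); // safe: destructor cannot run yet
+ // /* ... purge obsolete files, clean job context ... */
+ // bg_flush_scheduled_--; // after this, ~DBImpl may proceed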
+ log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles"); + } + job_context.Clean(); + mutex_.Lock(); + } + + assert(num_running_compactions_ > 0); + num_running_compactions_--; + if (bg_thread_pri == Env::Priority::LOW) { + bg_compaction_scheduled_--; + } else { + assert(bg_thread_pri == Env::Priority::BOTTOM); + bg_bottom_compaction_scheduled_--; + } + + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + + // See if there's more work to be done + MaybeScheduleFlushOrCompaction(); + if (made_progress || + (bg_compaction_scheduled_ == 0 && + bg_bottom_compaction_scheduled_ == 0) || + HasPendingManualCompaction() || unscheduled_compactions_ == 0) { + // signal if + // * made_progress -- need to wakeup DelayWrite + // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl + // * HasPendingManualCompaction -- need to wakeup RunManualCompaction + // If none of this is true, there is no need to signal since nobody is + // waiting for it + bg_cv_.SignalAll(); + } + // IMPORTANT: there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. + } +} + +Status DBImpl::BackgroundCompaction(bool* made_progress, + JobContext* job_context, + LogBuffer* log_buffer, + PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri) { + ManualCompactionState* manual_compaction = + prepicked_compaction == nullptr + ? nullptr + : prepicked_compaction->manual_compaction_state; + *made_progress = false; + mutex_.AssertHeld(); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); + + bool is_manual = (manual_compaction != nullptr); + std::unique_ptr c; + if (prepicked_compaction != nullptr && + prepicked_compaction->compaction != nullptr) { + c.reset(prepicked_compaction->compaction); + } + bool is_prepicked = is_manual || c; + + // (manual_compaction->in_progress == false); + bool trivial_move_disallowed = + is_manual && manual_compaction->disallow_trivial_move; + + CompactionJobStats compaction_job_stats; + Status status; + if (!error_handler_.IsBGWorkStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + status = Status::ShutdownInProgress(); + } else if (is_manual && + manual_compaction_paused_.load(std::memory_order_acquire)) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + } else { + status = error_handler_.GetBGError(); + // If we get here, it means a hard error happened after this compaction + // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got + // a chance to execute. 
Since we didn't pop a cfd from the compaction + // queue, increment unscheduled_compactions_ + unscheduled_compactions_++; + } + + if (!status.ok()) { + if (is_manual) { + manual_compaction->status = status; + manual_compaction->done = true; + manual_compaction->in_progress = false; + manual_compaction = nullptr; + } + if (c) { + c->ReleaseCompactionFiles(status); + c.reset(); + } + return status; + } + + if (is_manual) { + // another thread cannot pick up the same work + manual_compaction->in_progress = true; + } + + std::unique_ptr task_token; + + // InternalKey manual_end_storage; + // InternalKey* manual_end = &manual_end_storage; + bool sfm_reserved_compact_space = false; + if (is_manual) { + ManualCompactionState* m = manual_compaction; + assert(m->in_progress); + if (!c) { + m->done = true; + m->manual_end = nullptr; + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Manual compaction from level-%d from %s .. " + "%s; nothing to do\n", + m->cfd->GetName().c_str(), m->input_level, + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)")); + } else { + // First check if we have enough room to do the compaction + bool enough_room = EnoughRoomForCompaction( + m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c.reset(); + // m's vars will get set properly at the end of this function, + // as long as status == CompactionTooLarge + status = Status::CompactionTooLarge(); + } else { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Manual compaction from level-%d to level-%d from %s .. " + "%s; will stop at %s\n", + m->cfd->GetName().c_str(), m->input_level, c->output_level(), + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + ((m->done || m->manual_end == nullptr) + ? "(end)" + : m->manual_end->DebugString().c_str())); + } + } + } else if (!is_prepicked && !compaction_queue_.empty()) { + if (HasExclusiveManualCompaction()) { + // Can't compact right now, but try again later + TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); + + // Stay in the compaction queue. + unscheduled_compactions_++; + + return Status::OK(); + } + + auto cfd = PickCompactionFromQueue(&task_token, log_buffer); + if (cfd == nullptr) { + // Can't find any executable task from the compaction queue. + // All tasks have been throttled by compaction thread limiter. + ++unscheduled_compactions_; + return Status::Busy(); + } + + // We unreference here because the following code will take a Ref() on + // this cfd if it is going to use it (Compaction class holds a + // reference). + // This will all happen under a mutex so we don't have to be afraid of + // somebody else deleting it. + if (cfd->UnrefAndTryDelete()) { + // This was the last reference of the column family, so no need to + // compact. + return Status::OK(); + } + + // Pick up latest mutable CF Options and use it throughout the + // compaction job + // Compaction makes a copy of the latest MutableCFOptions. It should be used + // throughout the compaction procedure to make sure consistency. It will + // eventually be installed into SuperVersion + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) { + // NOTE: try to avoid unnecessary copy of MutableCFOptions if + // compaction is not necessary. 
Need to make sure mutex is held + // until we make a copy in the following code + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); + c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + + if (c != nullptr) { + bool enough_room = EnoughRoomForCompaction( + cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + + c.reset(); + // Don't need to sleep here, because BackgroundCallCompaction + // will sleep if !s.ok() + status = Status::CompactionTooLarge(); + } else { + // update statistics + RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersionAndScheduleWork + // 2) When MutableCFOptions changes. This case is also covered by + // InstallSuperVersionAndScheduleWork, because this is when the new + // options take effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add + // it to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + MaybeScheduleFlushOrCompaction(); + } + } + } + } + } + + if (!c) { + // Nothing to do + ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); + } else if (c->deletion_compaction()) { + // TODO(icanadi) Do we want to honor snapshots here? i.e. 
not delete old + // file if there is alive snapshot pointing to it + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + assert(c->num_input_files(1) == 0); + assert(c->level() == 0); + assert(c->column_family_data()->ioptions()->compaction_style == + kCompactionStyleFIFO); + + compaction_job_stats.num_input_files = c->num_input_files(0); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + for (const auto& f : *c->inputs(0)) { + c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); + } + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", + c->column_family_data()->GetName().c_str(), + c->num_input_files(0)); + *made_progress = true; + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } else if (!trivial_move_disallowed && c->IsTrivialMove()) { + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + // Instrument for event update + // TODO(yhchiang): add op details for showing trivial-move. + ThreadStatusUtil::SetColumnFamily( + c->column_family_data(), c->column_family_data()->ioptions()->env, + immutable_db_options_.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + + compaction_job_stats.num_input_files = c->num_input_files(0); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + // Move files to next level + int32_t moved_files = 0; + int64_t moved_bytes = 0; + for (unsigned int l = 0; l < c->num_input_levels(); l++) { + if (c->level(l) == c->output_level()) { + continue; + } + for (size_t i = 0; i < c->num_input_files(l); i++) { + FileMetaData* f = c->input(l, i); + c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); + c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), + f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, + f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, + f->file_checksum_func_name); + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", + c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c->output_level(), f->fd.GetFileSize()); + ++moved_files; + moved_bytes += f->fd.GetFileSize(); + } + } + + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); + // Use latest MutableCFOptions + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + + VersionStorageInfo::LevelSummaryStorage tmp; + c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), + moved_bytes); + { + event_logger_.LogToBuffer(log_buffer) + << "job" << job_context->job_id << "event" + << "trivial_move" + << "destination_level" << c->output_level() << "files" << moved_files + << "total_files_size" << moved_bytes; + } + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Moved #%d files to 
level-%d %" PRIu64 " bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), moved_files, + c->output_level(), moved_bytes, status.ToString().c_str(), + c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); + *made_progress = true; + + // Clear Instrument + ThreadStatusUtil::ResetThreadStatus(); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } else if (!is_prepicked && c->output_level() > 0 && + c->output_level() == + c->column_family_data() + ->current() + ->storage_info() + ->MaxOutputLevel( + immutable_db_options_.allow_ingest_behind) && + env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + // Forward compactions involving last level to the bottom pool if it exists, + // such that compactions unlikely to contribute to write stalls can be + // delayed or deprioritized. + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); + CompactionArg* ca = new CompactionArg; + ca->db = this; + ca->prepicked_compaction = new PrepickedCompaction; + ca->prepicked_compaction->compaction = c.release(); + ca->prepicked_compaction->manual_compaction_state = nullptr; + // Transfer requested token, so it doesn't need to do it again. + ca->prepicked_compaction->task_token = std::move(task_token); + ++bg_bottom_compaction_scheduled_; + env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM, + this, &DBImpl::UnscheduleCompactionCallback); + } else { + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + int output_level __attribute__((__unused__)); + output_level = c->output_level(); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", + &output_level); + std::vector snapshot_seqs; + SequenceNumber earliest_write_conflict_snapshot; + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); + assert(is_snapshot_supported_ || snapshots_.empty()); + CompactionJob compaction_job( + job_context->job_id, c.get(), immutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), stats_, + &mutex_, &error_handler_, snapshot_seqs, + earliest_write_conflict_snapshot, snapshot_checker, table_cache_, + &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + c->mutable_cf_options()->report_bg_io_stats, dbname_, + &compaction_job_stats, thread_pri, + is_manual ? 
&manual_compaction_paused_ : nullptr); + compaction_job.Prepare(); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); + compaction_job.Run(); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); + mutex_.Lock(); + + status = compaction_job.Install(*c->mutable_cf_options()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + } + *made_progress = true; + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } + if (c != nullptr) { + c->ReleaseCompactionFiles(status); + *made_progress = true; + +#ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_reserved_compact_space) { + sfm->OnCompactionCompletion(c.get()); + } +#endif // ROCKSDB_LITE + + NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + } + + if (status.ok() || status.IsCompactionTooLarge() || + status.IsManualCompactionPaused()) { + // Done + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { + // Ignore compaction errors found during shutting down + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", + status.ToString().c_str()); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { + // Put this cfd back in the compaction queue so we can retry after some + // time + auto cfd = c->column_family_data(); + assert(cfd != nullptr); + // Since this compaction failed, we need to recompute the score so it + // takes the original input files into account + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + if (!cfd->queued_for_compaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + } + } + } + // this will unref its input_version and column_family_data + c.reset(); + + if (is_manual) { + ManualCompactionState* m = manual_compaction; + if (!status.ok()) { + m->status = status; + m->done = true; + } + // For universal compaction: + // Because universal compaction always happens at level 0, so one + // compaction will pick up all overlapped files. No files will be + // filtered out due to size limit and left for a successive compaction. + // So we can safely conclude the current compaction. + // + // Also note that, if we don't stop here, then the current compaction + // writes a new file back to level 0, which will be used in successive + // compaction. Hence the manual compaction will never finish. + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (m->manual_end == nullptr) { + m->done = true; + } + if (!m->done) { + // We only compacted part of the requested range. Update *m + // to the range that is left to be compacted. 
+ // Universal and FIFO compactions should always compact the whole range + assert(m->cfd->ioptions()->compaction_style != + kCompactionStyleUniversal || + m->cfd->ioptions()->num_levels > 1); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); + m->tmp_storage = *m->manual_end; + m->begin = &m->tmp_storage; + m->incomplete = true; + } + m->in_progress = false; // not being processed anymore + } + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish"); + return status; +} + +bool DBImpl::HasPendingManualCompaction() { + return (!manual_compaction_dequeue_.empty()); +} + +void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) { + manual_compaction_dequeue_.push_back(m); +} + +void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { + // Remove from queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if (m == (*it)) { + it = manual_compaction_dequeue_.erase(it); + return; + } + ++it; + } + assert(false); + return; +} + +bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { + if (num_running_ingest_file_ > 0) { + // We need to wait for other IngestExternalFile() calls to finish + // before running a manual compaction. + return true; + } + if (m->exclusive) { + return (bg_bottom_compaction_scheduled_ > 0 || + bg_compaction_scheduled_ > 0); + } + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + bool seen = false; + while (it != manual_compaction_dequeue_.end()) { + if (m == (*it)) { + ++it; + seen = true; + continue; + } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { + // Consider the other manual compaction *it, conflicts if: + // overlaps with m + // and (*it) is ahead in the queue and is not yet in progress + return true; + } + ++it; + } + return false; +} + +bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { + // Remove from priority queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if ((*it)->exclusive) { + return true; + } + if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) { + // Allow automatic compaction if manual compaction is + // in progress + return true; + } + ++it; + } + return false; +} + +bool DBImpl::HasExclusiveManualCompaction() { + // Remove from priority queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if ((*it)->exclusive) { + return true; + } + ++it; + } + return false; +} + +bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { + if ((m->exclusive) || (m1->exclusive)) { + return true; + } + if (m->cfd != m1->cfd) { + return false; + } + return true; +} + +#ifndef ROCKSDB_LITE +void DBImpl::BuildCompactionJobInfo( + const ColumnFamilyData* cfd, Compaction* c, const Status& st, + const CompactionJobStats& compaction_job_stats, const int job_id, + const Version* current, CompactionJobInfo* compaction_job_info) const { + assert(compaction_job_info != nullptr); + compaction_job_info->cf_id = cfd->GetID(); + compaction_job_info->cf_name = cfd->GetName(); + compaction_job_info->status = st; + compaction_job_info->thread_id = env_->GetThreadID(); + compaction_job_info->job_id = job_id; + compaction_job_info->base_input_level = c->start_level(); + compaction_job_info->output_level = c->output_level(); + compaction_job_info->stats = compaction_job_stats; + compaction_job_info->table_properties = 
c->GetOutputTableProperties(); + compaction_job_info->compaction_reason = c->compaction_reason(); + compaction_job_info->compression = c->output_compression(); + for (size_t i = 0; i < c->num_input_levels(); ++i) { + for (const auto fmd : *c->inputs(i)) { + const FileDescriptor& desc = fmd->fd; + const uint64_t file_number = desc.GetNumber(); + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + desc.GetPathId()); + compaction_job_info->input_files.push_back(fn); + compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ + static_cast(i), file_number, fmd->oldest_blob_file_number}); + if (compaction_job_info->table_properties.count(fn) == 0) { + std::shared_ptr tp; + auto s = current->GetTableProperties(&tp, fmd, &fn); + if (s.ok()) { + compaction_job_info->table_properties[fn] = tp; + } + } + } + } + for (const auto& newf : c->edit()->GetNewFiles()) { + const FileMetaData& meta = newf.second; + const FileDescriptor& desc = meta.fd; + const uint64_t file_number = desc.GetNumber(); + compaction_job_info->output_files.push_back(TableFileName( + c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ + newf.first, file_number, meta.oldest_blob_file_number}); + } +} +#endif + +// SuperVersionContext gets created and destructed outside of the lock -- +// we use this conveniently to: +// * malloc one SuperVersion() outside of the lock -- new_superversion +// * delete SuperVersion()s outside of the lock -- superversions_to_free +// +// However, if InstallSuperVersionAndScheduleWork() gets called twice with the +// same sv_context, we can't reuse the SuperVersion() that got +// malloced because +// first call already used it. In that rare case, we take a hit and create a +// new SuperVersion() inside of the mutex. We do similar thing +// for superversion_to_free + +void DBImpl::InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options) { + mutex_.AssertHeld(); + + // Update max_total_in_memory_state_ + size_t old_memtable_size = 0; + auto* old_sv = cfd->GetSuperVersion(); + if (old_sv) { + old_memtable_size = old_sv->mutable_cf_options.write_buffer_size * + old_sv->mutable_cf_options.max_write_buffer_number; + } + + // this branch is unlikely to step in + if (UNLIKELY(sv_context->new_superversion == nullptr)) { + sv_context->NewSuperVersion(); + } + cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options); + + // There may be a small data race here. The snapshot tricking bottommost + // compaction may already be released here. But assuming there will always be + // newer snapshot created and released frequently, the compaction will be + // triggered soon anyway. + bottommost_files_mark_threshold_ = kMaxSequenceNumber; + for (auto* my_cfd : *versions_->GetColumnFamilySet()) { + bottommost_files_mark_threshold_ = std::min( + bottommost_files_mark_threshold_, + my_cfd->current()->storage_info()->bottommost_files_mark_threshold()); + } + + // Whenever we install new SuperVersion, we might need to issue new flushes or + // compactions. 
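+ // The max_total_in_memory_state_ update a few lines below is a
+ // replace-one-term adjustment. Worked example with hypothetical numbers:
+ // if the old options were 64MB write buffers x 2 buffers per CF (128MB)
+ // and the new options are 128MB x 2 (256MB), the tracked total grows by
+ // 128MB:
+ //
+ // max_total_in_memory_state_ = max_total_in_memory_state_
+ // - (uint64_t{64} << 20) * 2 // old per-CF budget
+ // + (uint64_t{128} << 20) * 2; // new per-CF budget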
+ SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + + // Update max_total_in_memory_state_ + max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size + + mutable_cf_options.write_buffer_size * + mutable_cf_options.max_write_buffer_number; +} + +// ShouldPurge is called by FindObsoleteFiles when doing a full scan, +// and db mutex (mutex_) should already be held. +// Actually, the current implementation of FindObsoleteFiles with +// full_scan=true can issue I/O requests to obtain list of files in +// directories, e.g. env_->getChildren while holding db mutex. +bool DBImpl::ShouldPurge(uint64_t file_number) const { + return files_grabbed_for_purge_.find(file_number) == + files_grabbed_for_purge_.end() && + purge_files_.find(file_number) == purge_files_.end(); +} + +// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex +// (mutex_) should already be held. +void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) { + files_grabbed_for_purge_.insert(file_number); +} + +void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) { + InstrumentedMutexLock l(&mutex_); + // snapshot_checker_ should only set once. If we need to set it multiple + // times, we need to make sure the old one is not deleted while it is still + // using by a compaction job. + assert(!snapshot_checker_); + snapshot_checker_.reset(snapshot_checker); +} + +void DBImpl::GetSnapshotContext( + JobContext* job_context, std::vector* snapshot_seqs, + SequenceNumber* earliest_write_conflict_snapshot, + SnapshotChecker** snapshot_checker_ptr) { + mutex_.AssertHeld(); + assert(job_context != nullptr); + assert(snapshot_seqs != nullptr); + assert(earliest_write_conflict_snapshot != nullptr); + assert(snapshot_checker_ptr != nullptr); + + *snapshot_checker_ptr = snapshot_checker_.get(); + if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) { + *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance(); + } + if (*snapshot_checker_ptr != nullptr) { + // If snapshot_checker is used, that means the flush/compaction may + // contain values not visible to snapshot taken after + // flush/compaction job starts. Take a snapshot and it will appear + // in snapshot_seqs and force compaction iterator to consider such + // snapshots. + const Snapshot* job_snapshot = + GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/); + job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot)); + } + *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc new file mode 100644 index 000000000..610b57d39 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_debug.cc @@ -0,0 +1,294 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
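+// The TEST_ hooks in this file are compiled only in debug builds (when
+// NDEBUG is unset). Typical use from a unit test, sketched with the common
+// dbfull() fixture helper (hypothetical test code, not part of this patch;
+// ASSERT_OK is the usual gtest-based test macro):
+//
+// DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+// ...
+// ASSERT_OK(dbfull()->TEST_FlushMemTable(true /*wait*/));
+// ASSERT_OK(dbfull()->TEST_WaitForCompact());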
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+void DBImpl::TEST_SwitchWAL() {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ void* writer = TEST_BeginWrite();
+ SwitchWAL(&write_context);
+ TEST_EndWrite(writer);
+}
+
+bool DBImpl::TEST_WALBufferIsEmpty(bool lock) {
+ if (lock) {
+ log_write_mutex_.Lock();
+ }
+ log::Writer* cur_log_writer = logs_.back().writer;
+ auto res = cur_log_writer->TEST_BufferIsEmpty();
+ if (lock) {
+ log_write_mutex_.Unlock();
+ }
+ return res;
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InstrumentedMutexLock l(&mutex_);
+ metadata->resize(NumberLevels());
+ for (int level = 0; level < NumberLevels(); level++) {
+ const std::vector<FileMetaData*>& files =
+ cfd->current()->storage_info()->LevelFiles(level);
+
+ (*metadata)[level].clear();
+ for (const auto& f : files) {
+ (*metadata)[level].push_back(*f);
+ }
+ }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+ return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+ return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+ const Slice* end,
+ ColumnFamilyHandle* column_family,
+ bool disallow_trivial_move) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ int output_level =
+ (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+ ?
level
+ : level + 1;
+ return RunManualCompaction(cfd, level, output_level, CompactRangeOptions(),
+ begin, end, true, disallow_trivial_move,
+ port::kMaxUint64 /*max_file_num_to_ignore*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ }
+
+ Status s;
+ void* writer = TEST_BeginWrite();
+ if (two_write_queues_) {
+ WriteThread::Writer nonmem_w;
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ s = SwitchMemtable(cfd, &write_context);
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ } else {
+ s = SwitchMemtable(cfd, &write_context);
+ }
+ TEST_EndWrite(writer);
+ return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+ ColumnFamilyHandle* cfh) {
+ FlushOptions fo;
+ fo.wait = wait;
+ fo.allow_write_stall = allow_write_stall;
+ ColumnFamilyData* cfd;
+ if (cfh == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh);
+ cfd = cfhi->cfd();
+ }
+ return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts) {
+ return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes.
+
+ // TODO: a bug here. This function actually does not necessarily
+ // wait for compact. It actually waits for scheduled compaction
+ // OR flush to finish.
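+ // Restated: the loop below waits until this predicate turns false,
+ // re-checking on every bg_cv_ signal (sketch of the condition only):
+ //
+ // bool busy =
+ // bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ // bg_flush_scheduled_ ||
+ // (wait_unscheduled && unscheduled_compactions_);
+ // // keep waiting while busy and no background error has been set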
+
+ InstrumentedMutexLock l(&mutex_);
+ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ ||
+ (wait_unscheduled && unscheduled_compactions_)) &&
+ (error_handler_.GetBGError() == Status::OK())) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+ auto w = new WriteThread::Writer();
+ write_thread_.EnterUnbatched(w, &mutex_);
+ return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+ auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+ write_thread_.ExitUnbatched(writer);
+ delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+ InstrumentedMutexLock l(&mutex_);
+ return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+ std::vector<std::string> cf_names;
+ std::vector<const ImmutableCFOptions*> iopts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_names.push_back(cfd->GetName());
+ iopts.push_back(cfd->ioptions());
+ }
+ }
+ iopts_map->clear();
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ iopts_map->insert({cf_names[i], iopts[i]});
+ }
+
+ return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+ return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+ return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+ return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+ autovector<MemTable*> empty_list;
+ return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr,
+ empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+ ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+ return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastAllocatedSequence();
+ }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+ uint64_t write_buffer_size) const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+void DBImpl::TEST_WaitForDumpStatsRun(std::function<void()> callback) const {
+ if (thread_dump_stats_ != nullptr) {
+ thread_dump_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+void DBImpl::TEST_WaitForPersistStatsRun(std::function<void()> callback) const {
+ if (thread_persist_stats_ != nullptr) {
+ thread_persist_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+bool DBImpl::TEST_IsPersistentStatsEnabled() const {
+ return thread_persist_stats_ && thread_persist_stats_->IsRunning();
+}
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+ return EstimateInMemoryStatsHistorySize();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // NDEBUG
diff --git
a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..f0c17ce95
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InternalKey start_key, end_key;
+ if (begin != nullptr) {
+ start_key.SetMinPossibleForUserKey(*begin);
+ }
+ if (end != nullptr) {
+ end_key.SetMaxPossibleForUserKey(*end);
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto vstorage = cfd->current()->storage_info();
+ for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+ std::vector<FileMetaData*> inputs;
+ vstorage->GetOverlappingInputs(
+ level, begin == nullptr ? nullptr : &start_key,
+ end == nullptr ? nullptr : &end_key, &inputs);
+ for (auto f : inputs) {
+ f->marked_for_compaction = true;
+ }
+ }
+ // Since we have some more files to compact, we should also recompute
+ // the compaction score.
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+ assert(column_family);
+
+ if (target_level < 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+ return Status::InvalidArgument("Invalid target level");
+ }
+
+ Status status;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ if (target_level >= vstorage->num_levels()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Target level %d does not exist\n",
+ target_level);
+ job_context.Clean();
+ return Status::InvalidArgument("Target level does not exist");
+ }
+
+ // Sort L0 files by range.
+ const InternalKeyComparator* icmp = &cfd->internal_comparator();
+ auto l0_files = vstorage->LevelFiles(0);
+ std::sort(l0_files.begin(), l0_files.end(),
+ [icmp](FileMetaData* f1, FileMetaData* f2) {
+ return icmp->Compare(f1->largest, f2->largest) < 0;
+ });
+
+ // Check that no L0 file is being compacted and that they have
+ // non-overlapping ranges.
+ for (size_t i = 0; i < l0_files.size(); ++i) {
+ auto f = l0_files[i];
+ if (f->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED.
File %" PRIu64 " being compacted\n", + f->fd.GetNumber()); + job_context.Clean(); + return Status::InvalidArgument("PromoteL0 called during L0 compaction"); + } + + if (i == 0) continue; + auto prev_f = l0_files[i - 1]; + if (icmp->Compare(prev_f->largest, f->smallest) >= 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64 + " have overlapping ranges\n", + prev_f->fd.GetNumber(), f->fd.GetNumber()); + job_context.Clean(); + return Status::InvalidArgument("L0 has overlapping files"); + } + } + + // Check that all levels up to target_level are empty. + for (int level = 1; level <= target_level; ++level) { + if (vstorage->NumLevelFiles(level) > 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. Level %d not empty\n", level); + job_context.Clean(); + return Status::InvalidArgument( + "All levels up to target_level " + "must be empty"); + } + } + + edit.SetColumnFamily(cfd->GetID()); + for (const auto& f : l0_files) { + edit.DeleteFile(0, f->fd.GetNumber()); + edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time, + f->file_checksum, f->file_checksum_func_name); + } + + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + } // lock released here + LogFlush(immutable_db_options_.info_log); + job_context.Clean(); + + return status; +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc new file mode 100644 index 000000000..c5d07dd01 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_files.cc @@ -0,0 +1,667 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_impl/db_impl.h" + +#include +#include +#include +#include "db/event_helpers.h" +#include "db/memtable_list.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +uint64_t DBImpl::MinLogNumberToKeep() { + if (allow_2pc()) { + return versions_->min_log_number_to_keep_2pc(); + } else { + return versions_->MinLogNumberWithUnflushedData(); + } +} + +uint64_t DBImpl::MinObsoleteSstNumberToKeep() { + mutex_.AssertHeld(); + if (!pending_outputs_.empty()) { + return *pending_outputs_.begin(); + } + return std::numeric_limits::max(); +} + +// * Returns the list of live files in 'sst_live' +// If it's doing full scan: +// * Returns the list of all files in the filesystem in +// 'full_scan_candidate_files'. +// Otherwise, gets obsolete files from VersionSet. 
+// no_full_scan = true -- never do the full scan using GetChildren() +// force = false -- don't force the full scan, except every +// mutable_db_options_.delete_obsolete_files_period_micros +// force = true -- force the full scan +void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, + bool no_full_scan) { + mutex_.AssertHeld(); + + // if deletion is disabled, do nothing + if (disable_delete_obsolete_files_ > 0) { + return; + } + + bool doing_the_full_scan = false; + + // logic for figuring out if we're doing the full scan + if (no_full_scan) { + doing_the_full_scan = false; + } else if (force || + mutable_db_options_.delete_obsolete_files_period_micros == 0) { + doing_the_full_scan = true; + } else { + const uint64_t now_micros = env_->NowMicros(); + if ((delete_obsolete_files_last_run_ + + mutable_db_options_.delete_obsolete_files_period_micros) < + now_micros) { + doing_the_full_scan = true; + delete_obsolete_files_last_run_ = now_micros; + } + } + + // don't delete files that might be currently written to from compaction + // threads + // Since job_context->min_pending_output is set, until file scan finishes, + // mutex_ cannot be released. Otherwise, we might see no min_pending_output + // here but later find newer generated unfinalized files while scanning. + if (!pending_outputs_.empty()) { + job_context->min_pending_output = *pending_outputs_.begin(); + } else { + // delete all of them + job_context->min_pending_output = std::numeric_limits::max(); + } + + // Get obsolete files. This function will also update the list of + // pending files in VersionSet(). + versions_->GetObsoleteFiles(&job_context->sst_delete_files, + &job_context->manifest_delete_files, + job_context->min_pending_output); + + // Mark the elements in job_context->sst_delete_files as grabbedForPurge + // so that other threads calling FindObsoleteFiles with full_scan=true + // will not add these files to candidate list for purge. + for (const auto& sst_to_del : job_context->sst_delete_files) { + MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber()); + } + + // store the current filenum, lognum, etc + job_context->manifest_file_number = versions_->manifest_file_number(); + job_context->pending_manifest_file_number = + versions_->pending_manifest_file_number(); + job_context->log_number = MinLogNumberToKeep(); + job_context->prev_log_number = versions_->prev_log_number(); + + versions_->AddLiveFiles(&job_context->sst_live); + if (doing_the_full_scan) { + InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), + dbname_); + std::set paths; + for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size(); + path_id++) { + paths.insert(immutable_db_options_.db_paths[path_id].path); + } + + // Note that if cf_paths is not specified in the ColumnFamilyOptions + // of a particular column family, we use db_paths as the cf_paths + // setting. Hence, there can be multiple duplicates of files from db_paths + // in the following code. The duplicate are removed while identifying + // unique files in PurgeObsoleteFiles. + for (auto cfd : *versions_->GetColumnFamilySet()) { + for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size(); + path_id++) { + auto& path = cfd->ioptions()->cf_paths[path_id].path; + + if (paths.find(path) == paths.end()) { + paths.insert(path); + } + } + } + + for (auto& path : paths) { + // set of all files in the directory. We'll exclude files that are still + // alive in the subsequent processings. 
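+ // The per-entry filter applied in the scan below, lifted out as a sketch
+ // (hypothetical helper name; the real checks are inlined in the loop,
+ // and ShouldPurge/ParseFileName are the actual functions used there):
+ //
+ // bool DBImpl::IsFullScanPurgeCandidate(const std::string& file,
+ // const std::string& prefix) {
+ // uint64_t number;
+ // FileType type;
+ // if (!ParseFileName(file, &number, prefix, &type)) {
+ // return false; // unrecognized name: skip it
+ // }
+ // return ShouldPurge(number); // not already grabbed or scheduled
+ // }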
+ std::vector files; + env_->GetChildren(path, &files); // Ignore errors + for (const std::string& file : files) { + uint64_t number; + FileType type; + // 1. If we cannot parse the file name, we skip; + // 2. If the file with file_number equals number has already been + // grabbed for purge by another compaction job, or it has already been + // schedule for purge, we also skip it if we + // are doing full scan in order to avoid double deletion of the same + // file under race conditions. See + // https://github.com/facebook/rocksdb/issues/3573 + if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) || + !ShouldPurge(number)) { + continue; + } + + // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + job_context->full_scan_candidate_files.emplace_back("/" + file, path); + } + } + + // Add log files in wal_dir + if (immutable_db_options_.wal_dir != dbname_) { + std::vector log_files; + env_->GetChildren(immutable_db_options_.wal_dir, + &log_files); // Ignore errors + for (const std::string& log_file : log_files) { + job_context->full_scan_candidate_files.emplace_back( + log_file, immutable_db_options_.wal_dir); + } + } + // Add info log files in db_log_dir + if (!immutable_db_options_.db_log_dir.empty() && + immutable_db_options_.db_log_dir != dbname_) { + std::vector info_log_files; + // Ignore errors + env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + for (std::string& log_file : info_log_files) { + job_context->full_scan_candidate_files.emplace_back( + log_file, immutable_db_options_.db_log_dir); + } + } + } + + // logs_ is empty when called during recovery, in which case there can't yet + // be any tracked obsolete logs + if (!alive_log_files_.empty() && !logs_.empty()) { + uint64_t min_log_number = job_context->log_number; + size_t num_alive_log_files = alive_log_files_.size(); + // find newly obsoleted log files + while (alive_log_files_.begin()->number < min_log_number) { + auto& earliest = *alive_log_files_.begin(); + if (immutable_db_options_.recycle_log_file_num > + log_recycle_files_.size()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "adding log %" PRIu64 " to recycle list\n", + earliest.number); + log_recycle_files_.push_back(earliest.number); + } else { + job_context->log_delete_files.push_back(earliest.number); + } + if (job_context->size_log_to_delete == 0) { + job_context->prev_total_log_size = total_log_size_; + job_context->num_alive_log_files = num_alive_log_files; + } + job_context->size_log_to_delete += earliest.size; + total_log_size_ -= earliest.size; + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + alive_log_files_.pop_front(); + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + // Current log should always stay alive since it can't have + // number < MinLogNumber(). + assert(alive_log_files_.size()); + } + while (!logs_.empty() && logs_.front().number < min_log_number) { + auto& log = logs_.front(); + if (log.getting_synced) { + log_sync_cv_.Wait(); + // logs_ could have changed while we were waiting. + continue; + } + logs_to_free_.push_back(log.ReleaseWriter()); + { + InstrumentedMutexLock wl(&log_write_mutex_); + logs_.pop_front(); + } + } + // Current log cannot be obsolete. + assert(!logs_.empty()); + } + + // We're just cleaning up for DB::Write(). 
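+ // The recycle-vs-delete choice made in the loop above, restated as a
+ // sketch (illustrative; `earliest` is the oldest alive WAL entry):
+ //
+ // if (immutable_db_options_.recycle_log_file_num >
+ // log_recycle_files_.size()) {
+ // log_recycle_files_.push_back(earliest.number); // keep for reuse
+ // } else {
+ // job_context->log_delete_files.push_back(earliest.number); // delete
+ // }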
+ assert(job_context->logs_to_free.empty()); + job_context->logs_to_free = logs_to_free_; + job_context->log_recycle_files.assign(log_recycle_files_.begin(), + log_recycle_files_.end()); + if (job_context->HaveSomethingToDelete()) { + ++pending_purge_obsolete_files_; + } + logs_to_free_.clear(); +} + +namespace { +bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, + const JobContext::CandidateFileInfo& second) { + if (first.file_name > second.file_name) { + return true; + } else if (first.file_name < second.file_name) { + return false; + } else { + return (first.file_path > second.file_path); + } +} +}; // namespace + +// Delete obsolete files and log status and information of file deletion +void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, + const std::string& path_to_sync, + FileType type, uint64_t number) { + Status file_deletion_status; + if (type == kTableFile || type == kLogFile) { + file_deletion_status = + DeleteDBFile(&immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); + } else { + file_deletion_status = env_->DeleteFile(fname); + } + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion", + &file_deletion_status); + if (file_deletion_status.ok()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id, + fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } else if (env_->FileExists(fname).IsNotFound()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64 + " -- %s\n", + job_id, fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n", + job_id, fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } + if (type == kTableFile) { + EventHelpers::LogAndNotifyTableFileDeletion( + &event_logger_, job_id, number, fname, file_deletion_status, GetName(), + immutable_db_options_.listeners); + } +} + +// Diffs the files listed in filenames and those that do not +// belong to live files are possibly removed. Also, removes all the +// files in sst_delete_files and log_delete_files. +// It is not necessary to hold the mutex when invoking this method. +void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin"); + // we'd better have sth to delete + assert(state.HaveSomethingToDelete()); + + // FindObsoleteFiles() should've populated this so nonzero + assert(state.manifest_file_number != 0); + + // Now, convert live list to an unordered map, WITHOUT mutex held; + // set is slow. + std::unordered_map sst_live_map; + for (const FileDescriptor& fd : state.sst_live) { + sst_live_map[fd.GetNumber()] = &fd; + } + std::unordered_set log_recycle_files_set( + state.log_recycle_files.begin(), state.log_recycle_files.end()); + + auto candidate_files = state.full_scan_candidate_files; + candidate_files.reserve( + candidate_files.size() + state.sst_delete_files.size() + + state.log_delete_files.size() + state.manifest_delete_files.size()); + // We may ignore the dbname when generating the file names. 
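+ // Note on the dedup applied to candidate_files further below:
+ // CompareCandidateFile (defined above) sorts descending by file_name and
+ // then by file_path, so e.g. "000012.sst" orders before "000007.sst";
+ // std::unique then drops exact (name, path) duplicates, ensuring no file
+ // is deleted twice. Sketch of the idiom:
+ //
+ // std::sort(candidate_files.begin(), candidate_files.end(),
+ // CompareCandidateFile);
+ // candidate_files.erase(
+ // std::unique(candidate_files.begin(), candidate_files.end()),
+ // candidate_files.end());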
+ for (auto& file : state.sst_delete_files) { + candidate_files.emplace_back( + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); + if (file.metadata->table_reader_handle) { + table_cache_->Release(file.metadata->table_reader_handle); + } + file.DeleteMetadata(); + } + + for (auto file_num : state.log_delete_files) { + if (file_num > 0) { + candidate_files.emplace_back(LogFileName(file_num), + immutable_db_options_.wal_dir); + } + } + for (const auto& filename : state.manifest_delete_files) { + candidate_files.emplace_back(filename, dbname_); + } + + // dedup state.candidate_files so we don't try to delete the same + // file twice + std::sort(candidate_files.begin(), candidate_files.end(), + CompareCandidateFile); + candidate_files.erase( + std::unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end()); + + if (state.prev_total_log_size > 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[JOB %d] Try to delete WAL files size %" PRIu64 + ", prev total WAL file size %" PRIu64 + ", number of live WAL files %" ROCKSDB_PRIszt ".\n", + state.job_id, state.size_log_to_delete, + state.prev_total_log_size, state.num_alive_log_files); + } + + std::vector old_info_log_files; + InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), + dbname_); + + // File numbers of most recent two OPTIONS file in candidate_files (found in + // previos FindObsoleteFiles(full_scan=true)) + // At this point, there must not be any duplicate file numbers in + // candidate_files. + uint64_t optsfile_num1 = std::numeric_limits::min(); + uint64_t optsfile_num2 = std::numeric_limits::min(); + for (const auto& candidate_file : candidate_files) { + const std::string& fname = candidate_file.file_name; + uint64_t number; + FileType type; + if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) || + type != kOptionsFile) { + continue; + } + if (number > optsfile_num1) { + optsfile_num2 = optsfile_num1; + optsfile_num1 = number; + } else if (number > optsfile_num2) { + optsfile_num2 = number; + } + } + + // Close WALs before trying to delete them. + for (const auto w : state.logs_to_free) { + // TODO: maybe check the return value of Close. + w->Close(); + } + + bool own_files = OwnTablesAndLogs(); + std::unordered_set files_to_del; + for (const auto& candidate_file : candidate_files) { + const std::string& to_delete = candidate_file.file_name; + uint64_t number; + FileType type; + // Ignore file if we cannot recognize it. + if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) { + continue; + } + + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number >= state.log_number) || + (number == state.prev_log_number) || + (log_recycle_files_set.find(number) != + log_recycle_files_set.end())); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (can happen during manifest roll) + keep = (number >= state.manifest_file_number); + break; + case kTableFile: + // If the second condition is not there, this makes + // DontDeletePendingOutputs fail + keep = (sst_live_map.find(number) != sst_live_map.end()) || + number >= state.min_pending_output; + if (!keep) { + files_to_del.insert(number); + } + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live". + // Also, SetCurrentFile creates a temp file when writing out new + // manifest, which is equal to state.pending_manifest_file_number. 
We + // should not delete that file + // + // TODO(yhchiang): carefully modify the third condition to safely + // remove the temp options files. + keep = (sst_live_map.find(number) != sst_live_map.end()) || + (number == state.pending_manifest_file_number) || + (to_delete.find(kOptionsFileNamePrefix) != std::string::npos); + break; + case kInfoLogFile: + keep = true; + if (number != 0) { + old_info_log_files.push_back(to_delete); + } + break; + case kOptionsFile: + keep = (number >= optsfile_num2); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", + reinterpret_cast(&number)); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", + reinterpret_cast(&keep)); + break; + case kCurrentFile: + case kDBLockFile: + case kIdentityFile: + case kMetaDatabase: + case kBlobFile: + keep = true; + break; + } + + if (keep) { + continue; + } + + std::string fname; + std::string dir_to_sync; + if (type == kTableFile) { + // evict from cache + TableCache::Evict(table_cache_.get(), number); + fname = MakeTableFileName(candidate_file.file_path, number); + dir_to_sync = candidate_file.file_path; + } else { + dir_to_sync = + (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_; + fname = dir_to_sync + + ((!dir_to_sync.empty() && dir_to_sync.back() == '/') || + (!to_delete.empty() && to_delete.front() == '/') + ? "" + : "/") + + to_delete; + } + +#ifndef ROCKSDB_LITE + if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 || + immutable_db_options_.wal_size_limit_mb > 0)) { + wal_manager_.ArchiveWALFile(fname, number); + continue; + } +#endif // !ROCKSDB_LITE + + // If I do not own these files, e.g. secondary instance with max_open_files + // = -1, then no need to delete or schedule delete these files since they + // will be removed by their owner, e.g. the primary instance. + if (!own_files) { + continue; + } + Status file_deletion_status; + if (schedule_only) { + InstrumentedMutexLock guard_lock(&mutex_); + SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id); + } else { + DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number); + } + } + + { + // After purging obsolete files, remove them from files_grabbed_for_purge_. + InstrumentedMutexLock guard_lock(&mutex_); + autovector to_be_removed; + for (auto fn : files_grabbed_for_purge_) { + if (files_to_del.count(fn) != 0) { + to_be_removed.emplace_back(fn); + } + } + for (auto fn : to_be_removed) { + files_grabbed_for_purge_.erase(fn); + } + } + + // Delete old info log files. + size_t old_info_log_file_count = old_info_log_files.size(); + if (old_info_log_file_count != 0 && + old_info_log_file_count >= immutable_db_options_.keep_log_file_num) { + std::sort(old_info_log_files.begin(), old_info_log_files.end()); + size_t end = + old_info_log_file_count - immutable_db_options_.keep_log_file_num; + for (unsigned int i = 0; i <= end; i++) { + std::string& to_delete = old_info_log_files.at(i); + std::string full_path_to_delete = + (immutable_db_options_.db_log_dir.empty() + ? 
dbname_
+               : immutable_db_options_.db_log_dir) +
+          "/" + to_delete;
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Delete info log file %s\n", state.job_id,
+                     full_path_to_delete.c_str());
+      Status s = env_->DeleteFile(full_path_to_delete);
+      if (!s.ok()) {
+        if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+              "-- %s\n",
+              state.job_id, to_delete.c_str(), s.ToString().c_str());
+        } else {
+          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                          "[JOB %d] Delete info log file %s FAILED -- %s\n",
+                          state.job_id, to_delete.c_str(),
+                          s.ToString().c_str());
+        }
+      }
+    }
+  }
+#ifndef ROCKSDB_LITE
+  wal_manager_.PurgeObsoleteWALFiles();
+#endif  // ROCKSDB_LITE
+  LogFlush(immutable_db_options_.info_log);
+  InstrumentedMutexLock l(&mutex_);
+  --pending_purge_obsolete_files_;
+  assert(pending_purge_obsolete_files_ >= 0);
+  if (pending_purge_obsolete_files_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  JobContext job_context(next_job_id_.fetch_add(1));
+  FindObsoleteFiles(&job_context, true);
+
+  mutex_.Unlock();
+  if (job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+  mutex_.Lock();
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+    const autovector<MemTable*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  // we must look through the memtables for two phase transactions
+  // that have been committed but not yet flushed
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        memtables_to_flush);
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    autovector<VersionEdit*> edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  // Calculate updated min_log_number_to_keep
+  // Since the function should only be called in 2pc mode, the log number in
+  // the version edit should be sufficient.
+
+  // Precompute the min log number containing unflushed data for the column
+  // family being flushed (`cfd_to_flush`).
+  uint64_t cf_min_log_number_to_keep = 0;
+  for (auto& e : edit_list) {
+    if (e->HasLogNumber()) {
+      cf_min_log_number_to_keep =
+          std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+    }
+  }
+  if (cf_min_log_number_to_keep == 0) {
+    // No version edit contains information on the log number. The log number
+    // for this column family should stay the same as it is.
+    cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+  }
+
+  // Get min log number containing unflushed data for other column families.
+  uint64_t min_log_number_to_keep =
+      vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+  if (cf_min_log_number_to_keep != 0) {
+    min_log_number_to_keep =
+        std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+  }
+
+  // In 2pc mode we must also consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check min logs with outstanding prep before we check
+  // logs referenced by memtables because a log referenced by the
+  // first data structure could transition to the second under us.
+  //
+  // TODO: iterating over all column families under the db mutex;
+  // we should find a more optimal solution
+  auto min_log_in_prep_heap =
+      prep_tracker->FindMinLogContainingOutstandingPrep();
+
+  if (min_log_in_prep_heap != 0 &&
+      min_log_in_prep_heap < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_in_prep_heap;
+  }
+
+  uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
+      vset, &cfd_to_flush, memtables_to_flush);
+
+  if (min_log_refed_by_mem != 0 &&
+      min_log_refed_by_mem < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_refed_by_mem;
+  }
+  return min_log_number_to_keep;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..6ae4ead54
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,1651 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/wal_filter.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src) {
+  auto db_options = SanitizeOptions(dbname, DBOptions(src));
+  ImmutableDBOptions immutable_db_options(db_options);
+  auto cf_options =
+      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+  return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
+  DBOptions result(src);
+
+  if (result.file_system == nullptr) {
+    if (result.env == Env::Default()) {
+      result.file_system = FileSystem::Default();
+    } else {
+      result.file_system.reset(new LegacyFileSystemWrapper(result.env));
+    }
+  } else {
+    if (result.env == nullptr) {
+      result.env = Env::Default();
+    }
+  }
+
+  // A result.max_open_files value of -1 means an unlimited number of open
+  // files.
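+  // Sketch of the clamping below, assuming port::GetMaxOpenFiles() reports
+  // a hypothetical platform limit of 1024:
+  //
+  //   result.max_open_files == 5        -> raised to the floor of 20
+  //   result.max_open_files == 1 << 30  -> lowered to 1024
+  //   result.max_open_files == -1       -> left as-is ("unlimited")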
+ if (result.max_open_files != -1) { + int max_max_open_files = port::GetMaxOpenFiles(); + if (max_max_open_files == -1) { + max_max_open_files = 0x400000; + } + ClipToRange(&result.max_open_files, 20, max_max_open_files); + TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles", + &result.max_open_files); + } + + if (result.info_log == nullptr) { + Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = nullptr; + } + } + + if (!result.write_buffer_manager) { + result.write_buffer_manager.reset( + new WriteBufferManager(result.db_write_buffer_size)); + } + auto bg_job_limits = DBImpl::GetBGJobLimits( + result.max_background_flushes, result.max_background_compactions, + result.max_background_jobs, true /* parallelize_compactions */); + result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions, + Env::Priority::LOW); + result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes, + Env::Priority::HIGH); + + if (result.rate_limiter.get() != nullptr) { + if (result.bytes_per_sync == 0) { + result.bytes_per_sync = 1024 * 1024; + } + } + + if (result.delayed_write_rate == 0) { + if (result.rate_limiter.get() != nullptr) { + result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond(); + } + if (result.delayed_write_rate == 0) { + result.delayed_write_rate = 16 * 1024 * 1024; + } + } + + if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) { + result.recycle_log_file_num = false; + } + + if (result.recycle_log_file_num && + (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || + result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { + // kPointInTimeRecovery is inconsistent with recycle log file feature since + // we define the "end" of the log as the first corrupt record we encounter. + // kAbsoluteConsistency doesn't make sense because even a clean + // shutdown leaves old junk at the end of the log file. + result.recycle_log_file_num = 0; + } + + if (result.wal_dir.empty()) { + // Use dbname as default + result.wal_dir = dbname; + } + if (result.wal_dir.back() == '/') { + result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); + } + + if (result.db_paths.size() == 0) { + result.db_paths.emplace_back(dbname, std::numeric_limits::max()); + } + + if (result.use_direct_reads && result.compaction_readahead_size == 0) { + TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr); + result.compaction_readahead_size = 1024 * 1024 * 2; + } + + if (result.compaction_readahead_size > 0 || result.use_direct_reads) { + result.new_table_reader_for_compaction_inputs = true; + } + + // Force flush on DB open if 2PC is enabled, since with 2PC we have no + // guarantee that consecutive log files have consecutive sequence id, which + // make recovery complicated. + if (result.allow_2pc) { + result.avoid_flush_during_recovery = false; + } + +#ifndef ROCKSDB_LITE + ImmutableDBOptions immutable_db_options(result); + if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + // Either the WAL dir and db_paths[0]/db_name are not the same, or we + // cannot tell for sure. 
In either case, assume they're different and + // explicitly cleanup the trash log files (bypass DeleteScheduler) + // Do this first so even if we end up calling + // DeleteScheduler::CleanupDirectory on the same dir later, it will be + // safe + std::vector filenames; + result.env->GetChildren(result.wal_dir, &filenames); + for (std::string& filename : filenames) { + if (filename.find(".log.trash", filename.length() - + std::string(".log.trash").length()) != + std::string::npos) { + std::string trash_file = result.wal_dir + "/" + filename; + result.env->DeleteFile(trash_file); + } + } + } + // When the DB is stopped, it's possible that there are some .trash files that + // were not deleted yet, when we open the DB we will find these .trash files + // and schedule them to be deleted (or delete immediately if SstFileManager + // was not used) + auto sfm = static_cast(result.sst_file_manager.get()); + for (size_t i = 0; i < result.db_paths.size(); i++) { + DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); + } + + // Create a default SstFileManager for purposes of tracking compaction size + // and facilitating recovery from out of space errors. + if (result.sst_file_manager.get() == nullptr) { + std::shared_ptr sst_file_manager( + NewSstFileManager(result.env, result.info_log)); + result.sst_file_manager = sst_file_manager; + } +#endif + + if (!result.paranoid_checks) { + result.skip_checking_sst_file_sizes_on_db_open = true; + ROCKS_LOG_INFO(result.info_log, + "file size check will be skipped during open."); + } + + return result; +} + +namespace { +Status SanitizeOptionsByTable( + const DBOptions& db_opts, + const std::vector& column_families) { + Status s; + for (auto cf : column_families) { + s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} +} // namespace + +Status DBImpl::ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families) { + Status s; + for (auto& cfd : column_families) { + s = ColumnFamilyData::ValidateOptions(db_options, cfd.options); + if (!s.ok()) { + return s; + } + } + s = ValidateOptions(db_options); + return s; +} + +Status DBImpl::ValidateOptions(const DBOptions& db_options) { + if (db_options.db_paths.size() > 4) { + return Status::NotSupported( + "More than four DB paths are not supported yet. "); + } + + if (db_options.allow_mmap_reads && db_options.use_direct_reads) { + // Protect against assert in PosixMMapReadableFile constructor + return Status::NotSupported( + "If memory mapped reads (allow_mmap_reads) are enabled " + "then direct I/O reads (use_direct_reads) must be disabled. "); + } + + if (db_options.allow_mmap_writes && + db_options.use_direct_io_for_flush_and_compaction) { + return Status::NotSupported( + "If memory mapped writes (allow_mmap_writes) are enabled " + "then direct I/O writes (use_direct_io_for_flush_and_compaction) must " + "be disabled. 
"); + } + + if (db_options.keep_log_file_num == 0) { + return Status::InvalidArgument("keep_log_file_num must be greater than 0"); + } + + if (db_options.unordered_write && + !db_options.allow_concurrent_memtable_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with !allow_concurrent_memtable_write"); + } + + if (db_options.unordered_write && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with enable_pipelined_write"); + } + + if (db_options.atomic_flush && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "atomic_flush is incompatible with enable_pipelined_write"); + } + + return Status::OK(); +} + +Status DBImpl::NewDB() { + VersionEdit new_db; + Status s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.write_dbid_to_manifest) { + std::string temp_db_id; + GetDbIdentityFromIdentityFile(&temp_db_id); + new_db.SetDBId(temp_db_id); + } + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); + const std::string manifest = DescriptorFileName(dbname_, 1); + { + std::unique_ptr file; + FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); + s = NewWritableFile(fs_.get(), manifest, &file, file_options); + if (!s.ok()) { + return s; + } + file->SetPreallocationBlockSize( + immutable_db_options_.manifest_preallocation_size); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), manifest, file_options, env_, nullptr /* stats */, + immutable_db_options_.listeners)); + log::Writer log(std::move(file_writer), 0, false); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + if (s.ok()) { + s = SyncManifest(env_, &immutable_db_options_, log.file()); + } + } + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir()); + } else { + fs_->DeleteFile(manifest, IOOptions(), nullptr); + } + return s; +} + +Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr* directory) { + // We call CreateDirIfMissing() as the directory may already exist (if we + // are reopening a DB), when this happens we don't want creating the + // directory to cause an error. However, we need to check if creating the + // directory fails or else we may get an obscure message about the lock + // file not existing. One real-world example of this occurring is if + // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. + // when dbname_ is "dir/db" but when "dir" doesn't exist. 
+ Status s = env->CreateDirIfMissing(dirname); + if (!s.ok()) { + return s; + } + return env->NewDirectory(dirname, directory); +} + +Status Directories::SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths) { + Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); + if (!s.ok()) { + return s; + } + if (!wal_dir.empty() && dbname != wal_dir) { + s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_); + if (!s.ok()) { + return s; + } + } + + data_dirs_.clear(); + for (auto& p : data_paths) { + const std::string db_path = p.path; + if (db_path == dbname) { + data_dirs_.emplace_back(nullptr); + } else { + std::unique_ptr path_directory; + s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory); + if (!s.ok()) { + return s; + } + data_dirs_.emplace_back(path_directory.release()); + } + } + assert(data_dirs_.size() == data_paths.size()); + return Status::OK(); +} + +Status DBImpl::Recover( + const std::vector& column_families, bool read_only, + bool error_if_log_file_exist, bool error_if_data_exists_in_logs, + uint64_t* recovered_seq) { + mutex_.AssertHeld(); + + bool is_new_db = false; + assert(db_lock_ == nullptr); + if (!read_only) { + Status s = directories_.SetDirectories(env_, dbname_, + immutable_db_options_.wal_dir, + immutable_db_options_.db_paths); + if (!s.ok()) { + return s; + } + + s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + std::string current_fname = CurrentFileName(dbname_); + s = env_->FileExists(current_fname); + if (s.IsNotFound()) { + if (immutable_db_options_.create_if_missing) { + s = NewDB(); + is_new_db = true; + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + current_fname, "does not exist (create_if_missing is false)"); + } + } else if (s.ok()) { + if (immutable_db_options_.error_if_exists) { + return Status::InvalidArgument(dbname_, + "exists (error_if_exists is true)"); + } + } else { + // Unexpected error reading file + assert(s.IsIOError()); + return s; + } + // Verify compatibility of file_options_ and filesystem + { + std::unique_ptr idfile; + FileOptions customized_fs(file_options_); + customized_fs.use_direct_reads |= + immutable_db_options_.use_direct_io_for_flush_and_compaction; + s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, + nullptr); + if (!s.ok()) { + std::string error_str = s.ToString(); + // Check if unsupported Direct I/O is the root cause + customized_fs.use_direct_reads = false; + s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, + nullptr); + if (s.ok()) { + return Status::InvalidArgument( + "Direct I/O is not supported by the specified DB."); + } else { + return Status::InvalidArgument( + "Found options incompatible with filesystem", error_str.c_str()); + } + } + } + } + assert(db_id_.empty()); + Status s = versions_->Recover(column_families, read_only, &db_id_); + if (!s.ok()) { + return s; + } + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. + if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. 
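+    // Sketch of the fallback below (db_id_ is known to be absent from the
+    // MANIFEST at this point):
+    //
+    //   IDENTITY missing -> SetIdentityFile() writes a fresh ID, which is
+    //                       then read back into db_id_
+    //   IDENTITY present -> db_id_ is read from it; if
+    //                       write_dbid_to_manifest is set, the ID is also
+    //                       persisted to the MANIFEST via LogAndApply()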
+ if (s.IsNotFound()) { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + s = GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + false); + } + } else { + s = SetIdentityFile(env_, dbname_, db_id_); + } + + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + if (s.ok() && !read_only) { + std::map> created_dirs; + for (auto cfd : *versions_->GetColumnFamilySet()) { + s = cfd->AddDirectories(&created_dirs); + if (!s.ok()) { + return s; + } + } + } + // DB mutex is already held + if (s.ok() && immutable_db_options_.persist_stats_to_disk) { + s = InitPersistStatsColumnFamily(); + } + + if (s.ok()) { + // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // may check this value to decide whether to flush. + max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + + SequenceNumber next_sequence(kMaxSequenceNumber); + default_cf_handle_ = new ColumnFamilyHandleImpl( + versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); + default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + // TODO(Zhongyi): handle single_column_family_mode_ when + // persistent_stats is enabled + single_column_family_mode_ = + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; + + // Recover from all newer log files than the ones named in the + // descriptor (new log files may have been added by the previous + // incarnation without registering them in the descriptor). + // + // Note that prev_log_number() is no longer used, but we pay + // attention to it in case we are recovering a database + // produced by an older version of rocksdb. 
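+    // Sketch of the directory scan that follows, with a hypothetical
+    // wal_dir listing (only names that parse as kLogFile are replayed):
+    //
+    //   000005.log      -> number=5, type=kLogFile  -> recovery candidate
+    //   000007.log      -> number=7, type=kLogFile  -> recovery candidate
+    //   MANIFEST-000004 -> type=kDescriptorFile     -> skipped
+    //   CURRENT, LOCK   -> other types              -> skipped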
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+    if (s.IsNotFound()) {
+      return Status::InvalidArgument("wal_dir not found",
+                                     immutable_db_options_.wal_dir);
+    } else if (!s.ok()) {
+      return s;
+    }
+
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
+        if (is_new_db) {
+          return Status::Corruption(
+              "While creating a new DB, wal_dir contains "
+              "existing log file: ",
+              filenames[i]);
+        } else {
+          logs.push_back(number);
+        }
+      }
+    }
+
+    if (logs.size() > 0) {
+      if (error_if_log_file_exist) {
+        return Status::Corruption(
+            "The db was opened in readonly mode with error_if_log_file_exist "
+            "flag but a log file already exists");
+      } else if (error_if_data_exists_in_logs) {
+        for (auto& log : logs) {
+          std::string fname = LogFileName(immutable_db_options_.wal_dir, log);
+          uint64_t bytes;
+          s = env_->GetFileSize(fname, &bytes);
+          if (s.ok()) {
+            if (bytes > 0) {
+              return Status::Corruption(
+                  "error_if_data_exists_in_logs is set but there is data "
+                  "in the log files.");
+            }
+          }
+        }
+      }
+    }
+
+    if (!logs.empty()) {
+      // Recover in the order in which the logs were generated
+      std::sort(logs.begin(), logs.end());
+      bool corrupted_log_found = false;
+      s = RecoverLogFiles(logs, &next_sequence, read_only,
+                          &corrupted_log_found);
+      if (corrupted_log_found && recovered_seq != nullptr) {
+        *recovered_seq = next_sequence;
+      }
+      if (!s.ok()) {
+        // Clear memtables if recovery failed
+        for (auto cfd : *versions_->GetColumnFamilySet()) {
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 kMaxSequenceNumber);
+        }
+      }
+    }
+  }
+
+  if (read_only) {
+    // If we are opening as read-only, we need to update options_file_number_
+    // to reflect the most recent OPTIONS file. It does not matter for regular
+    // read-write db instances because options_file_number_ will later be
+    // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+    std::vector<std::string> file_names;
+    if (s.ok()) {
+      s = env_->GetChildren(GetName(), &file_names);
+    }
+    if (s.ok()) {
+      uint64_t number = 0;
+      uint64_t options_file_number = 0;
+      FileType type;
+      for (const auto& fname : file_names) {
+        if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+          options_file_number = std::max(number, options_file_number);
+        }
+      }
+      versions_->options_file_number_ = options_file_number;
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+  mutex_.AssertHeld();
+  Status s;
+  // persist version when stats CF doesn't exist
+  bool should_persist_format_version = !persistent_stats_cfd_exists_;
+  mutex_.Unlock();
+  if (persistent_stats_cfd_exists_) {
+    // Check persistent stats format version compatibility. Drop and recreate
+    // persistent stats CF if format version is incompatible
+    uint64_t format_version_recovered = 0;
+    Status s_format = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+    uint64_t compatible_version_recovered = 0;
+    Status s_compatible = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kCompatibleVersion,
+        &compatible_version_recovered);
+    // abort reading from existing stats CF if any of the following is true:
+    // 1. failed to read format version or compatible version from disk
+    // 2.
sst's format version is greater than current format version, meaning + // this sst is encoded with a newer RocksDB release, and current compatible + // version is below the sst's compatible version + if (!s_format.ok() || !s_compatible.ok() || + (kStatsCFCurrentFormatVersion < format_version_recovered && + kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { + if (!s_format.ok() || !s_compatible.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Reading persistent stats version key failed. Format key: %s, " + "compatible key: %s", + s_format.ToString().c_str(), s_compatible.ToString().c_str()); + } else { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Disable persistent stats due to corrupted or incompatible format " + "version\n"); + } + DropColumnFamily(persist_stats_cf_handle_); + DestroyColumnFamilyHandle(persist_stats_cf_handle_); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } + } + if (s.ok() && should_persist_format_version) { + // Persistent stats CF being created for the first time, need to write + // format version key + WriteBatch batch; + batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } + mutex_.Lock(); + return s; +} + +Status DBImpl::InitPersistStatsColumnFamily() { + mutex_.AssertHeld(); + assert(!persist_stats_cf_handle_); + ColumnFamilyData* persistent_stats_cfd = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr; + + Status s; + if (persistent_stats_cfd != nullptr) { + // We are recovering from a DB which already contains persistent stats CF, + // the CF is already created in VersionSet::ApplyOneVersionEdit, but + // column family handle was not. Need to explicitly create handle here. + persist_stats_cf_handle_ = + new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); + } else { + mutex_.Unlock(); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + mutex_.Lock(); + } + return s; +} + +// REQUIRES: log_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, bool read_only, + bool* corrupted_log_found) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + Status* status; // nullptr if immutable_db_options_.paranoid_checks==false + void Corruption(size_t bytes, const Status& s) override { + ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", + (this->status == nullptr ? 
"(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != nullptr && this->status->ok()) { + *this->status = s; + } + } + }; + + mutex_.AssertHeld(); + Status status; + std::unordered_map version_edits; + // no need to refcount because iteration is under mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + version_edits.insert({cfd->GetID(), edit}); + } + int job_id = next_job_id_.fetch_add(1); + { + auto stream = event_logger_.Log(); + stream << "job" << job_id << "event" + << "recovery_started"; + stream << "log_files"; + stream.StartArray(); + for (auto log_number : log_numbers) { + stream << log_number; + } + stream.EndArray(); + } + +#ifndef ROCKSDB_LITE + if (immutable_db_options_.wal_filter != nullptr) { + std::map cf_name_id_map; + std::map cf_lognumber_map; + for (auto cfd : *versions_->GetColumnFamilySet()) { + cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID())); + cf_lognumber_map.insert( + std::make_pair(cfd->GetID(), cfd->GetLogNumber())); + } + + immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map, + cf_name_id_map); + } +#endif + + bool stop_replay_by_wal_filter = false; + bool stop_replay_for_corruption = false; + bool flushed = false; + uint64_t corrupted_log_number = kMaxSequenceNumber; + uint64_t min_log_number = MinLogNumberToKeep(); + for (auto log_number : log_numbers) { + if (log_number < min_log_number) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Skipping log #%" PRIu64 + " since it is older than min log to keep #%" PRIu64, + log_number, min_log_number); + continue; + } + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsed(log_number); + // Open the log file + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Recovering log #%" PRIu64 " mode %d", log_number, + static_cast(immutable_db_options_.wal_recovery_mode)); + auto logFileDropped = [this, &fname]() { + uint64_t bytes; + if (env_->GetFileSize(fname, &bytes).ok()) { + auto info_log = immutable_db_options_.info_log.get(); + ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(), + static_cast(bytes)); + } + }; + if (stop_replay_by_wal_filter) { + logFileDropped(); + continue; + } + + std::unique_ptr file_reader; + { + std::unique_ptr file; + status = fs_->NewSequentialFile(fname, + fs_->OptimizeForLogRead(file_options_), + &file, nullptr); + if (!status.ok()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } + } + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size)); + } + + // Create the log reader. 
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = immutable_db_options_.info_log.get();
+    reporter.fname = fname.c_str();
+    if (!immutable_db_options_.paranoid_checks ||
+        immutable_db_options_.wal_recovery_mode ==
+            WALRecoveryMode::kSkipAnyCorruptedRecords) {
+      reporter.status = nullptr;
+    } else {
+      reporter.status = &status;
+    }
+    // We intentionally make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+                       &reporter, true /*checksum*/, log_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of
+    // the log, then read all the records and add them to a memtable.
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+
+    while (!stop_replay_by_wal_filter &&
+           reader.ReadRecord(&record, &scratch,
+                             immutable_db_options_.wal_recovery_mode) &&
+           status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+      if (immutable_db_options_.wal_recovery_mode ==
+          WALRecoveryMode::kPointInTimeRecovery) {
+        // In point-in-time recovery mode, if the sequence ids of the log
+        // records are consecutive, we continue recovery despite corruption.
+        // This could happen when we open and write to a corrupted DB, where
+        // sequence ids will start from the last sequence id we recovered.
+        if (sequence == *next_sequence) {
+          stop_replay_for_corruption = false;
+        }
+        if (stop_replay_for_corruption) {
+          logFileDropped();
+          break;
+        }
+      }
+
+#ifndef ROCKSDB_LITE
+      if (immutable_db_options_.wal_filter != nullptr) {
+        WriteBatch new_batch;
+        bool batch_changed = false;
+
+        WalFilter::WalProcessingOption wal_processing_option =
+            immutable_db_options_.wal_filter->LogRecordFound(
+                log_number, fname, batch, &new_batch, &batch_changed);
+
+        switch (wal_processing_option) {
+          case WalFilter::WalProcessingOption::kContinueProcessing:
+            // do nothing, proceed normally
+            break;
+          case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+            // skip current record
+            continue;
+          case WalFilter::WalProcessingOption::kStopReplay:
+            // skip current record and stop replay
+            stop_replay_by_wal_filter = true;
+            continue;
+          case WalFilter::WalProcessingOption::kCorruptedRecord: {
+            status =
+                Status::Corruption("Corruption reported by Wal Filter ",
+                                   immutable_db_options_.wal_filter->Name());
+            MaybeIgnoreError(&status);
+            if (!status.ok()) {
+              reporter.Corruption(record.size(), status);
+              continue;
+            }
+            break;
+          }
+          default: {
+            assert(false);  // unhandled case
+            status = Status::NotSupported(
+                "Unknown WalProcessingOption returned"
+                " by Wal Filter ",
+                immutable_db_options_.wal_filter->Name());
+            MaybeIgnoreError(&status);
+            if (!status.ok()) {
+              return status;
+            } else {
+              // Ignore the error with current record processing.
+              continue;
+            }
+          }
+        }
+
+        if (batch_changed) {
+          // Make sure that the count in the new batch is
+          // within the original count.
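+          // For example, with a hypothetical filter:
+          //
+          //   original batch: Put(a), Put(b), Put(c)  -> count == 3
+          //   new batch:      Put(a), Put(c)          -> count == 2, OK
+          //   new batch:      Put(a..d)               -> count == 4, abort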
+ int new_count = WriteBatchInternal::Count(&new_batch); + int original_count = WriteBatchInternal::Count(&batch); + if (new_count > original_count) { + ROCKS_LOG_FATAL( + immutable_db_options_.info_log, + "Recovering log #%" PRIu64 + " mode %d log filter %s returned " + "more records (%d) than original (%d) which is not allowed. " + "Aborting recovery.", + log_number, + static_cast(immutable_db_options_.wal_recovery_mode), + immutable_db_options_.wal_filter->Name(), new_count, + original_count); + status = Status::NotSupported( + "More than original # of records " + "returned by Wal Filter ", + immutable_db_options_.wal_filter->Name()); + return status; + } + // Set the same sequence number in the new_batch + // as the original batch. + WriteBatchInternal::SetSequence(&new_batch, + WriteBatchInternal::Sequence(&batch)); + batch = new_batch; + } + } +#endif // ROCKSDB_LITE + + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), &flush_scheduler_, + &trim_history_scheduler_, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); + MaybeIgnoreError(&status); + if (!status.ok()) { + // We are treating this as a failure while reading since we read valid + // blocks that do not form coherent data + reporter.Corruption(record.size(), status); + continue; + } + + if (has_valid_writes && !read_only) { + // we can do this because this is called before client has access to the + // DB and there is only a single thread operating on DB + ColumnFamilyData* cfd; + + while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { + cfd->UnrefAndTryDelete(); + // If this asserts, it means that InsertInto failed in + // filtering updates to already-flushed column families + assert(cfd->GetLogNumber() <= log_number); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + return status; + } + flushed = true; + + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + *next_sequence); + } + } + } + + if (!status.ok()) { + if (status.IsNotSupported()) { + // We should not treat NotSupported as corruption. It is rather a clear + // sign that we are processing a WAL that is produced by an incompatible + // version of the code. 
+ return status; + } + if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kSkipAnyCorruptedRecords) { + // We should ignore all errors unconditionally + status = Status::OK(); + } else if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery) { + // We should ignore the error but not continue replaying + status = Status::OK(); + stop_replay_for_corruption = true; + corrupted_log_number = log_number; + if (corrupted_log_found != nullptr) { + *corrupted_log_found = true; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Point in time recovered to log #%" PRIu64 + " seq #%" PRIu64, + log_number, *next_sequence); + } else { + assert(immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kAbsoluteConsistency); + return status; + } + } + + flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } + } + // Compare the corrupted log number to all columnfamily's current log number. + // Abort Open() if any column family's log number is greater than + // the corrupted log number, which means CF contains data beyond the point of + // corruption. This could during PIT recovery when the WAL is corrupted and + // some (but not all) CFs are flushed + // Exclude the PIT case where no log is dropped after the corruption point. + // This is to cover the case for empty logs after corrupted log, in which we + // don't reset stop_replay_for_corruption. + if (stop_replay_for_corruption == true && + (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords)) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->GetLogNumber() > corrupted_log_number) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Column family inconsistency: SST file contains data" + " beyond the point of corruption."); + return Status::Corruption("SST file is ahead of WALs"); + } + } + } + + // True if there's any data in the WALs; if not, we can skip re-processing + // them later + bool data_seen = false; + if (!read_only) { + // no need to refcount since client still doesn't have access + // to the DB and can not drop column families while we iterate + auto max_log_number = log_numbers.back(); + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + + if (cfd->GetLogNumber() > max_log_number) { + // Column family cfd has already flushed the data + // from all logs. Memtable has to be empty because + // we filter the updates based on log_number + // (in WriteBatch::InsertInto) + assert(cfd->mem()->GetFirstSequenceNumber() == 0); + assert(edit->NumEntries() == 0); + continue; + } + + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr); + + // flush the final memtable (if non-empty) + if (cfd->mem()->GetFirstSequenceNumber() != 0) { + // If flush happened in the middle of recovery (e.g. due to memtable + // being full), we flush at the end. 
Otherwise we'll need to record
+        // where we were on the last flush, which makes the logic complicated.
+        if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+          if (!status.ok()) {
+            // Recovery failed
+            break;
+          }
+          flushed = true;
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 versions_->LastSequence());
+        }
+        data_seen = true;
+      }
+
+      // Update the log number info in the version edit corresponding to this
+      // column family. Note that the version edits will be written to MANIFEST
+      // together later.
+      // writing log_number in the manifest means that any log file
+      // with number strictly less than (log_number + 1) is already
+      // recovered and should be ignored on next reincarnation.
+      // Since we already recovered max_log_number, we want all logs
+      // with numbers `<= max_log_number` (including this one) to be ignored
+      if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+        edit->SetLogNumber(max_log_number + 1);
+      }
+    }
+    if (status.ok()) {
+      // we must mark the next log number as used, even though it's
+      // not actually used. that is because VersionSet assumes
+      // VersionSet::next_file_number_ always to be strictly greater than any
+      // log number
+      versions_->MarkFileNumberUsed(max_log_number + 1);
+
+      autovector<ColumnFamilyData*> cfds;
+      autovector<const MutableCFOptions*> cf_opts;
+      autovector<autovector<VersionEdit*>> edit_lists;
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        cfds.push_back(cfd);
+        cf_opts.push_back(cfd->GetLatestMutableCFOptions());
+        auto iter = version_edits.find(cfd->GetID());
+        assert(iter != version_edits.end());
+        edit_lists.push_back({&iter->second});
+      }
+      // write MANIFEST with update
+      status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_,
+                                      directories_.GetDbDir(),
+                                      /*new_descriptor_log=*/true);
+    }
+  }
+
+  if (status.ok() && data_seen && !flushed) {
+    status = RestoreAliveLogFiles(log_numbers);
+  }
+
+  event_logger_.Log() << "job" << job_id << "event"
+                      << "recovery_finished";
+
+  return status;
+}
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers) {
+  if (log_numbers.empty()) {
+    return Status::OK();
+  }
+  Status s;
+  mutex_.AssertHeld();
+  assert(immutable_db_options_.avoid_flush_during_recovery);
+  if (two_write_queues_) {
+    log_write_mutex_.Lock();
+  }
+  // Mark these as alive so they'll be considered for deletion later by
+  // FindObsoleteFiles()
+  total_log_size_ = 0;
+  log_empty_ = false;
+  for (auto log_number : log_numbers) {
+    LogFileNumberSize log(log_number);
+    std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+    // This gets the apparent size of the logs, not including preallocated
+    // space.
+    s = env_->GetFileSize(fname, &log.size);
+    if (!s.ok()) {
+      break;
+    }
+    total_log_size_ += log.size;
+    alive_log_files_.push_back(log);
+    // We preallocate space for logs, but then after a crash and restart, that
+    // preallocated space is not needed anymore. It is likely only the last
+    // log has such preallocated space, so we only truncate for the last log.
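+    // E.g., assuming the last WAL was preallocated to 4 MB but only
+    // ~1.2 MB was ever written:
+    //
+    //   GetFileSize("000012.log") -> 1258291 (apparent size)
+    //   Truncate(1258291) below drops the unused preallocated tail;
+    //   a truncate failure is logged but deliberately non-fatal.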
+ if (log_number == log_numbers.back()) { + std::unique_ptr last_log; + Status truncate_status = fs_->ReopenWritableFile( + fname, + fs_->OptimizeForLogWrite( + file_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_)), + &last_log, nullptr); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(IOOptions(), nullptr); + } + // Not a critical error if fail to truncate. + if (!truncate_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", log_number, + truncate_status.ToString().c_str()); + } + } + } + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + return s; +} + +Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, + MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + std::unique_ptr::iterator> pending_outputs_inserted_elem( + new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + Status s; + TableProperties table_properties; + { + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); + + // Get the latest mutable cf options while the mutex is still locked + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + bool paranoid_file_checks = + cfd->GetLatestMutableCFOptions()->paranoid_file_checks; + + int64_t _current_time = 0; + env_->GetCurrentTime(&_current_time); // ignore error + const uint64_t current_time = static_cast(_current_time); + meta.oldest_ancester_time = current_time; + + { + auto write_hint = cfd->CalculateSSTWriteHint(0); + mutex_.Unlock(); + + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + auto snapshot_checker = snapshot_checker_.get(); + if (use_custom_gc_ && snapshot_checker == nullptr) { + snapshot_checker = DisableGCSnapshotChecker::Instance(); + } + std::vector> + range_del_iters; + auto range_del_iter = + mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + s = BuildTable( + dbname_, env_, fs_.get(), *cfd->ioptions(), mutable_cf_options, + file_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), + mutable_cf_options.sample_for_compression, + cfd->ioptions()->compression_opts, paranoid_file_checks, + cfd->internal_stats(), TableFileCreationReason::kRecovery, + &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, + -1 /* level */, current_time, write_hint); + LogFlush(immutable_db_options_.info_log); + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), + meta.fd.GetFileSize(), s.ToString().c_str()); + mutex_.Lock(); 
+ } + } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + int level = 0; + if (s.ok() && meta.fd.GetFileSize() > 0) { + edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), + meta.fd.GetFileSize(), meta.smallest, meta.largest, + meta.fd.smallest_seqno, meta.fd.largest_seqno, + meta.marked_for_compaction, meta.oldest_blob_file_number, + meta.oldest_ancester_time, meta.file_creation_time, + meta.file_checksum, meta.file_checksum_func_name); + } + + InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.fd.GetFileSize(); + stats.num_output_files = 1; + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); + cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, + meta.fd.GetFileSize()); + RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); + return s; +} + +Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + if (db_options.persist_stats_to_disk) { + column_families.push_back( + ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + } + std::vector handles; + Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + if (db_options.persist_stats_to_disk) { + assert(handles.size() == 2); + } else { + assert(handles.size() == 1); + } + // i can delete the handle since DBImpl is always holding a reference to + // default column family + if (db_options.persist_stats_to_disk && handles[1] != nullptr) { + delete handles[1]; + } + delete handles[0]; + } + return s; +} + +Status DB::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + const bool kSeqPerBatch = true; + const bool kBatchPerTxn = true; + return DBImpl::Open(db_options, dbname, column_families, handles, dbptr, + !kSeqPerBatch, kBatchPerTxn); +} + +Status DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log) { + Status s; + std::unique_ptr lfile; + + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + FileOptions opt_file_options = + fs_->OptimizeForLogWrite(file_options_, db_options); + std::string log_fname = + LogFileName(immutable_db_options_.wal_dir, log_file_num); + + if (recycle_log_number) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "reusing log %" PRIu64 " from recycle list\n", + recycle_log_number); + std::string old_log_fname = + LogFileName(immutable_db_options_.wal_dir, recycle_log_number); + TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1"); + TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2"); + s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, + &lfile, /*dbg=*/nullptr); + } else { + s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); + } + + if (s.ok()) { + lfile->SetWriteLifeTimeHint(CalculateWALWriteHint()); + lfile->SetPreallocationBlockSize(preallocate_block_size); + + const auto& listeners = immutable_db_options_.listeners; + std::unique_ptr file_writer( + new WritableFileWriter(std::move(lfile), log_fname, 
opt_file_options, + env_, nullptr /* stats */, listeners)); + *new_log = new log::Writer(std::move(file_writer), log_file_num, + immutable_db_options_.recycle_log_file_num > 0, + immutable_db_options_.manual_wal_flush); + } + return s; +} + +Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + const bool seq_per_batch, const bool batch_per_txn) { + Status s = SanitizeOptionsByTable(db_options, column_families); + if (!s.ok()) { + return s; + } + + s = ValidateOptions(db_options, column_families); + if (!s.ok()) { + return s; + } + + *dbptr = nullptr; + handles->clear(); + + size_t max_write_buffer_size = 0; + for (auto cf : column_families) { + max_write_buffer_size = + std::max(max_write_buffer_size, cf.options.write_buffer_size); + } + + DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); + s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir); + if (s.ok()) { + std::vector paths; + for (auto& db_path : impl->immutable_db_options_.db_paths) { + paths.emplace_back(db_path.path); + } + for (auto& cf : column_families) { + for (auto& cf_path : cf.options.cf_paths) { + paths.emplace_back(cf_path.path); + } + } + for (auto& path : paths) { + s = impl->env_->CreateDirIfMissing(path); + if (!s.ok()) { + break; + } + } + + // For recovery from NoSpace() error, we can only handle + // the case where the database is stored in a single path + if (paths.size() <= 1) { + impl->error_handler_.EnableAutoRecovery(); + } + } + + if (!s.ok()) { + delete impl; + return s; + } + + s = impl->CreateArchivalDirectory(); + if (!s.ok()) { + delete impl; + return s; + } + + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + + impl->mutex_.Lock(); + // Handles create_if_missing, error_if_exists + uint64_t recovered_seq(kMaxSequenceNumber); + s = impl->Recover(column_families, false, false, false, &recovered_seq); + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + log::Writer* new_log = nullptr; + const size_t preallocate_block_size = + impl->GetWalPreallocateBlockSize(max_write_buffer_size); + s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/, + preallocate_block_size, &new_log); + if (s.ok()) { + InstrumentedMutexLock wl(&impl->log_write_mutex_); + impl->logfile_number_ = new_log_number; + assert(new_log != nullptr); + impl->logs_.emplace_back(new_log_number, new_log); + } + + if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd != nullptr) { + handles->push_back( + new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + impl->NewThreadStatusCfInfo(cfd); + } else { + if (db_options.create_missing_column_families) { + // missing column family, create it + ColumnFamilyHandle* handle; + impl->mutex_.Unlock(); + s = impl->CreateColumnFamily(cf.options, cf.name, &handle); + impl->mutex_.Lock(); + if (s.ok()) { + handles->push_back(handle); + } else { + break; + } + } else { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + } + } + } + if (s.ok()) { + SuperVersionContext sv_context(/* create_superversion */ true); + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + impl->InstallSuperVersionAndScheduleWork( + cfd, &sv_context, *cfd->GetLatestMutableCFOptions()); + } + sv_context.Clean(); + if (impl->two_write_queues_) { + impl->log_write_mutex_.Lock(); + 
} + impl->alive_log_files_.push_back( + DBImpl::LogFileNumberSize(impl->logfile_number_)); + if (impl->two_write_queues_) { + impl->log_write_mutex_.Unlock(); + } + + impl->DeleteObsoleteFiles(); + s = impl->directories_.GetDbDir()->Fsync(); + } + if (s.ok()) { + // In WritePrepared there could be gap in sequence numbers. This breaks + // the trick we use in kPointInTimeRecovery which assumes the first seq in + // the log right after the corrupted log is one larger than the last seq + // we read from the logs. To let this trick keep working, we add a dummy + // entry with the expected sequence to the first log right after recovery. + // In non-WritePrepared case also the new log after recovery could be + // empty, and thus missing the consecutive seq hint to distinguish + // middle-log corruption to corrupted-log-remained-after-recovery. This + // case also will be addressed by a dummy write. + if (recovered_seq != kMaxSequenceNumber) { + WriteBatch empty_batch; + WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); + WriteOptions write_options; + uint64_t log_used, log_size; + log::Writer* log_writer = impl->logs_.back().writer; + s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size); + if (s.ok()) { + // Need to fsync, otherwise it might get lost after a power reset. + s = impl->FlushWAL(false); + if (s.ok()) { + s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + } + } + } + } + } + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // try to read format version but no need to fail Open() even if it fails + s = impl->PersistentStatsProcessFormatVersion(); + } + + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + auto* vstorage = cfd->current()->storage_info(); + for (int i = 1; i < vstorage->num_levels(); ++i) { + int num_files = vstorage->NumLevelFiles(i); + if (num_files > 0) { + s = Status::InvalidArgument( + "Not all files are at level 0. Cannot " + "open with FIFO compaction style."); + break; + } + } + } + if (!cfd->mem()->IsSnapshotSupported()) { + impl->is_snapshot_supported_ = false; + } + if (cfd->ioptions()->merge_operator != nullptr && + !cfd->mem()->IsMergeOperatorSupported()) { + s = Status::InvalidArgument( + "The memtable of column family %s does not support merge operator " + "its options.merge_operator is non-null", + cfd->GetName().c_str()); + } + if (!s.ok()) { + break; + } + } + } + TEST_SYNC_POINT("DBImpl::Open:Opened"); + Status persist_options_status; + if (s.ok()) { + // Persist RocksDB Options before scheduling the compaction. + // The WriteOptionsFile() will release and lock the mutex internally. + persist_options_status = impl->WriteOptionsFile( + false /*need_mutex_lock*/, false /*need_enter_write_thread*/); + + *dbptr = impl; + impl->opened_successfully_ = true; + impl->MaybeScheduleFlushOrCompaction(); + } + impl->mutex_.Unlock(); + +#ifndef ROCKSDB_LITE + auto sfm = static_cast( + impl->immutable_db_options_.sst_file_manager.get()); + if (s.ok() && sfm) { + // Notify SstFileManager about all sst files that already exist in + // db_paths[0] and cf_paths[0] when the DB is opened. + + // SstFileManagerImpl needs to know sizes of the files. For files whose size + // we already know (sst files that appear in manifest - typically that's the + // vast majority of all files), we'll pass the size to SstFileManager. + // For all other files SstFileManager will query the size from filesystem. 
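+    // (For illustration: an entry recorded in the MANIFEST as "/000123.sst"
+    // is tracked below under the key "000123.sst", so it can be matched
+    // against GetChildren() output; files absent from the map fall back to a
+    // filesystem size query by the SstFileManager.)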
+ + std::vector metadata; + + impl->mutex_.Lock(); + impl->versions_->GetLiveFilesMetaData(&metadata); + impl->mutex_.Unlock(); + + std::unordered_map known_file_sizes; + for (const auto& md : metadata) { + std::string name = md.name; + if (!name.empty() && name[0] == '/') { + name = name.substr(1); + } + known_file_sizes[name] = md.size; + } + + std::vector paths; + paths.emplace_back(impl->immutable_db_options_.db_paths[0].path); + for (auto& cf : column_families) { + if (!cf.options.cf_paths.empty()) { + paths.emplace_back(cf.options.cf_paths[0].path); + } + } + // Remove duplicate paths. + std::sort(paths.begin(), paths.end()); + paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + for (auto& path : paths) { + std::vector existing_files; + impl->immutable_db_options_.env->GetChildren(path, &existing_files); + for (auto& file_name : existing_files) { + uint64_t file_number; + FileType file_type; + std::string file_path = path + "/" + file_name; + if (ParseFileName(file_name, &file_number, &file_type) && + file_type == kTableFile) { + if (known_file_sizes.count(file_name)) { + // We're assuming that each sst file name exists in at most one of + // the paths. + sfm->OnAddFile(file_path, known_file_sizes.at(file_name), + /* compaction */ false); + } else { + sfm->OnAddFile(file_path); + } + } + } + } + + // Reserve some disk buffer space. This is a heuristic - when we run out + // of disk space, this ensures that there is atleast write_buffer_size + // amount of free space before we resume DB writes. In low disk space + // conditions, we want to avoid a lot of small L0 files due to frequent + // WAL write failures and resultant forced flushes + sfm->ReserveDiskBuffer(max_write_buffer_size, + impl->immutable_db_options_.db_paths[0].path); + } +#endif // !ROCKSDB_LITE + + if (s.ok()) { + ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p", + impl); + LogFlush(impl->immutable_db_options_.info_log); + assert(impl->TEST_WALBufferIsEmpty()); + // If the assert above fails then we need to FlushWAL before returning + // control back to the user. + if (!persist_options_status.ok()) { + s = Status::IOError( + "DB::Open() failed --- Unable to persist Options file", + persist_options_status.ToString()); + } + } + if (s.ok()) { + impl->StartTimedTasks(); + } + if (!s.ok()) { + for (auto* h : *handles) { + delete h; + } + handles->clear(); + delete impl; + *dbptr = nullptr; + } + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc new file mode 100644 index 000000000..a4242bfe1 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
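+//
+// A minimal read-only usage sketch (editor's illustration; the path, key and
+// error handling are hypothetical, the entry points are the ones implemented
+// below):
+//
+//   DB* db = nullptr;
+//   Status s = DB::OpenForReadOnly(Options(), "/path/to/db", &db);
+//   if (s.ok()) {
+//     std::string value;
+//     s = db->Get(ReadOptions(), "key", &value);
+//     delete db;
+//   }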
+ +#include "db/db_impl/db_impl_readonly.h" +#include "db/arena_wrapped_db_iter.h" + +#include "db/compacted_db_impl.h" +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/merge_context.h" +#include "monitoring/perf_context_imp.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE + +DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, + const std::string& dbname) + : DBImpl(db_options, dbname) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in read only mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplReadOnly::~DBImplReadOnly() {} + +// Implementations of the DB interface +Status DBImplReadOnly::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + // TODO: stopwatch DB_GET needed?, perf timer needed? + PERF_TIMER_GUARD(get_snapshot_time); + Status s; + SequenceNumber snapshot = versions_->LastSequence(); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + SuperVersion* super_version = cfd->GetSuperVersion(); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, pinnable_val, &s, + &merge_context, &max_covering_tombstone_seq); + RecordTick(stats_, MEMTABLE_MISS); + } + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordInHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + return s; +} + +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + ReadCallback* read_callback = nullptr; // No read callback provided. + auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + read_seq, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback); + auto internal_iter = + NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplReadOnly::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + ReadCallback* read_callback = nullptr; // No read callback provided. + if (iterators == nullptr) { + return Status::InvalidArgument("iterators not allowed to be nullptr"); + } + iterators->clear(); + iterators->reserve(column_families.size()); + SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? 
reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + + for (auto cfh : column_families) { + auto* cfd = reinterpret_cast(cfh)->cfd(); + auto* sv = cfd->GetSuperVersion()->Ref(); + auto* db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback); + auto* internal_iter = + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); + } + + return Status::OK(); +} + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool /*error_if_log_file_exist*/) { + *dbptr = nullptr; + + // Try to first open DB as fully compacted DB + Status s; + s = CompactedDBImpl::Open(options, dbname, dbptr); + if (s.ok()) { + return s; + } + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + + s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a + // reference to default column family + delete handles[0]; + } + return s; +} + +Status DB::OpenForReadOnly( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_log_file_exist) { + *dbptr = nullptr; + handles->clear(); + + SuperVersionContext sv_context(/* create_superversion */ true); + DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); + impl->mutex_.Lock(); + Status s = impl->Recover(column_families, true /* read only */, + error_if_log_file_exist); + if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd == nullptr) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto* h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} + +#else // !ROCKSDB_LITE + +Status DB::OpenForReadOnly(const Options& /*options*/, + const std::string& /*dbname*/, DB** /*dbptr*/, + bool /*error_if_log_file_exist*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenForReadOnly( + const DBOptions& /*db_options*/, const std::string& /*dbname*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/, + bool /*error_if_log_file_exist*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h new file mode 100644 index 000000000..04d06b4a1 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_readonly.h 
@@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImplReadOnly : public DBImpl { + public: + DBImplReadOnly(const DBOptions& options, const std::string& dbname); + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&) = delete; + void operator=(const DBImplReadOnly&) = delete; + + virtual ~DBImplReadOnly(); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + // TODO: Implement ReadOnly MultiGet? + + using DBImpl::NewIterator; + virtual Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::SingleDelete; + virtual Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, 
manifest_file_size, + false /* flush_memtable */); + } + + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::SyncWAL; + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + private: + friend class DB; +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc new file mode 100644 index 000000000..f0ec27c32 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc @@ -0,0 +1,671 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_impl/db_impl_secondary.h" + +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/merge_context.h" +#include "logging/auto_roll_logger.h" +#include "monitoring/perf_context_imp.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +DBImplSecondary::DBImplSecondary(const DBOptions& db_options, + const std::string& dbname) + : DBImpl(db_options, dbname) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in secondary mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplSecondary::~DBImplSecondary() {} + +Status DBImplSecondary::Recover( + const std::vector& column_families, + bool /*readonly*/, bool /*error_if_log_file_exist*/, + bool /*error_if_data_exists_in_logs*/, uint64_t*) { + mutex_.AssertHeld(); + + JobContext job_context(0); + Status s; + s = static_cast(versions_.get()) + ->Recover(column_families, &manifest_reader_, &manifest_reporter_, + &manifest_reader_status_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + // Initial max_total_in_memory_state_ before recovery logs. 
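+  // (For illustration: with write_buffer_size = 64 MB and
+  // max_write_buffer_number = 2, each column family contributes 128 MB to
+  // this total.)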
+  max_total_in_memory_state_ = 0;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+                                  mutable_cf_options->max_write_buffer_number;
+  }
+  if (s.ok()) {
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+    single_column_family_mode_ =
+        versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+    std::unordered_set<ColumnFamilyData*> cfds_changed;
+    s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+  }
+
+  if (s.IsPathNotFound()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Secondary tries to read WAL, but WAL file(s) have already "
+                   "been purged by primary.");
+    s = Status::OK();
+  }
+  // TODO: update options_file_number_ needed?
+
+  job_context.Clean();
+  return s;
+}
+
+// Find new WALs and apply them in order to the secondary instance
+Status DBImplSecondary::FindAndRecoverLogFiles(
+    std::unordered_set<ColumnFamilyData*>* cfds_changed,
+    JobContext* job_context) {
+  assert(nullptr != cfds_changed);
+  assert(nullptr != job_context);
+  Status s;
+  std::vector<uint64_t> logs;
+  s = FindNewLogNumbers(&logs);
+  if (s.ok() && !logs.empty()) {
+    SequenceNumber next_sequence(kMaxSequenceNumber);
+    s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+  }
+  return s;
+}
+
+// List wal_dir, find all new WALs, and return their log numbers
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+  assert(logs != nullptr);
+  std::vector<std::string> filenames;
+  Status s;
+  s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+  if (s.IsNotFound()) {
+    return Status::InvalidArgument("Failed to open wal_dir",
+                                   immutable_db_options_.wal_dir);
+  } else if (!s.ok()) {
+    return s;
+  }
+
+  // if log_readers_ is non-empty, it means we have applied all logs with log
+  // numbers smaller than the smallest log in log_readers_, so there is no
+  // need to pass these logs to RecoverLogFiles
+  uint64_t log_number_min = 0;
+  if (!log_readers_.empty()) {
+    log_number_min = log_readers_.begin()->first;
+  }
+  for (size_t i = 0; i < filenames.size(); i++) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(filenames[i], &number, &type) && type == kLogFile &&
+        number >= log_number_min) {
+      logs->push_back(number);
+    }
+  }
+  // Recover logs in the order that they were generated
+  if (!logs->empty()) {
+    std::sort(logs->begin(), logs->end());
+  }
+  return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+    uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+  auto iter = log_readers_.find(log_number);
+  // make sure the log file is still present
+  if (iter == log_readers_.end() ||
+      iter->second->reader_->GetLogNumber() != log_number) {
+    // delete the obsolete log reader if the log numbers mismatch
+    if (iter != log_readers_.end()) {
+      log_readers_.erase(iter);
+    }
+    // initialize log reader from log_number
+    // TODO: min_log_number_to_keep_2pc check needed?
+    // Open the log file
+    std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Recovering log #%" PRIu64 " mode %d", log_number,
+                   static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+    std::unique_ptr<SequentialFileReader> file_reader;
+    {
+      std::unique_ptr<FSSequentialFile> file;
+      Status status = fs_->NewSequentialFile(
+          fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+      if (!status.ok()) {
+        *log_reader = nullptr;
+        return status;
+      }
+      file_reader.reset(new SequentialFileReader(
+          std::move(file), fname, immutable_db_options_.log_readahead_size));
+    }
+
+    // Create the log reader.
+    LogReaderContainer* log_reader_container = new LogReaderContainer(
+        env_, immutable_db_options_.info_log, std::move(fname),
+        std::move(file_reader), log_number);
+    log_readers_.insert(std::make_pair(
+        log_number,
+        std::unique_ptr<LogReaderContainer>(log_reader_container)));
+  }
+  iter = log_readers_.find(log_number);
+  assert(iter != log_readers_.end());
+  *log_reader = iter->second->reader_;
+  return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+    const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+    std::unordered_set<ColumnFamilyData*>* cfds_changed,
+    JobContext* job_context) {
+  assert(nullptr != cfds_changed);
+  assert(nullptr != job_context);
+  mutex_.AssertHeld();
+  Status status;
+  for (auto log_number : log_numbers) {
+    log::FragmentBufferedReader* reader = nullptr;
+    status = MaybeInitLogReader(log_number, &reader);
+    if (!status.ok()) {
+      return status;
+    }
+    assert(reader != nullptr);
+  }
+  for (auto log_number : log_numbers) {
+    auto it = log_readers_.find(log_number);
+    assert(it != log_readers_.end());
+    log::FragmentBufferedReader* reader = it->second->reader_;
+    // Manually update the file number allocation counter in VersionSet.
+    versions_->MarkFileNumberUsed(log_number);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+
+    while (reader->ReadRecord(&record, &scratch,
+                              immutable_db_options_.wal_recovery_mode) &&
+           status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reader->GetReporter()->Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+      std::vector<uint32_t> column_family_ids;
+      status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+      if (status.ok()) {
+        for (const auto id : column_family_ids) {
+          ColumnFamilyData* cfd =
+              versions_->GetColumnFamilySet()->GetColumnFamily(id);
+          if (cfd == nullptr) {
+            continue;
+          }
+          if (cfds_changed->count(cfd) == 0) {
+            cfds_changed->insert(cfd);
+          }
+          const std::vector<FileMetaData*>& l0_files =
+              cfd->current()->storage_info()->LevelFiles(0);
+          SequenceNumber seq =
+              l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+          // If the write batch's sequence number does not exceed the largest
+          // sequence number persisted for this column family, then its data
+          // must reside in an SST that has already been added in the prior
+          // MANIFEST replay.
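+          // (Illustrative numbers: if the newest L0 file persists sequence
+          // numbers up to 100 and this batch carries sequence 90, the check
+          // below skips the batch.)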
+          if (seq_of_batch <= seq) {
+            continue;
+          }
+          auto curr_log_num = port::kMaxUint64;
+          if (cfd_to_current_log_.count(cfd) > 0) {
+            curr_log_num = cfd_to_current_log_[cfd];
+          }
+          // If the active memtable contains records added by replaying an
+          // earlier WAL, then we need to seal the memtable, add it to the
+          // immutable memtable list and create a new active memtable.
+          if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 ||
+                                         curr_log_num != log_number)) {
+            const MutableCFOptions mutable_cf_options =
+                *cfd->GetLatestMutableCFOptions();
+            MemTable* new_mem =
+                cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
+            cfd->mem()->SetNextLogNumber(log_number);
+            cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
+            new_mem->Ref();
+            cfd->SetMemtable(new_mem);
+          }
+        }
+        bool has_valid_writes = false;
+        status = WriteBatchInternal::InsertInto(
+            &batch, column_family_memtables_.get(),
+            nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler */,
+            true, log_number, this, false /* concurrent_memtable_writes */,
+            next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
+      }
+      // If a column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the insert.
+      // We don't want to fail the whole write batch in that case -- we just
+      // ignore the update. That's why ignore_missing_column_families is set
+      // to true above. Passing a null flush_scheduler disables memtable
+      // flushing, as required for secondary instances.
+      if (status.ok()) {
+        for (const auto id : column_family_ids) {
+          ColumnFamilyData* cfd =
+              versions_->GetColumnFamilySet()->GetColumnFamily(id);
+          if (cfd == nullptr) {
+            continue;
+          }
+          std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
+              cfd_to_current_log_.find(cfd);
+          if (iter == cfd_to_current_log_.end()) {
+            cfd_to_current_log_.insert({cfd, log_number});
+          } else if (log_number > iter->second) {
+            iter->second = log_number;
+          }
+        }
+        auto last_sequence = *next_sequence - 1;
+        if ((*next_sequence != kMaxSequenceNumber) &&
+            (versions_->LastSequence() <= last_sequence)) {
+          versions_->SetLastAllocatedSequence(last_sequence);
+          versions_->SetLastPublishedSequence(last_sequence);
+          versions_->SetLastSequence(last_sequence);
+        }
+      } else {
+        // We are treating this as a failure while reading since we read valid
+        // blocks that do not form coherent data
+        reader->GetReporter()->Corruption(record.size(), status);
+      }
+    }
+    if (!status.ok()) {
+      return status;
+    }
+  }
+  // remove log readers from the map after successfully recovering the WAL
+  if (log_readers_.size() > 1) {
+    auto erase_iter = log_readers_.begin();
+    std::advance(erase_iter, log_readers_.size() - 1);
+    log_readers_.erase(log_readers_.begin(), erase_iter);
+  }
+  return status;
+}
+
+// Implementation of the DB interface
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            PinnableSlice* value) {
+  return GetImpl(read_options, column_family, key, value);
+}
+
+Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, PinnableSlice* pinnable_val) {
+  assert(pinnable_val != nullptr);
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+  StopWatch sw(env_, stats_, DB_GET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      tracer_->Get(column_family, key);
} + } + // Acquire SuperVersion + SuperVersion* super_version = GetAndRefSuperVersion(cfd); + SequenceNumber snapshot = versions_->LastSequence(); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + Status s; + LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); + + bool done = false; + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + super_version->imm->Get( + lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, super_version); + return s; + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, pinnable_val, &s, + &merge_context, &max_covering_tombstone_seq); + RecordTick(stats_, MEMTABLE_MISS); + } + { + PERF_TIMER_GUARD(get_post_process_time); + ReturnAndCleanupSuperVersion(cfd, super_version); + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordTimeToHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + return s; +} + +Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } + Iterator* result = nullptr; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + ReadCallback* read_callback = nullptr; // No read callback provided. + if (read_options.tailing) { + return NewErrorIterator(Status::NotSupported( + "tailing iterator not supported in secondary mode")); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. 
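+    // (Reads in secondary mode are instead pinned to the latest sequence
+    // number this instance knows of at iterator creation time; see the
+    // LastSequence() call below.)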
+    return NewErrorIterator(
+        Status::NotSupported("snapshot not supported in secondary mode"));
+  } else {
+    auto snapshot = versions_->LastSequence();
+    result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+  }
+  return result;
+}
+
+ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
+    const ReadOptions& read_options, ColumnFamilyData* cfd,
+    SequenceNumber snapshot, ReadCallback* read_callback) {
+  assert(nullptr != cfd);
+  SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+  auto db_iter = NewArenaWrappedDbIterator(
+      env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+      snapshot,
+      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+      super_version->version_number, read_callback);
+  auto internal_iter =
+      NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
+                          db_iter->GetRangeDelAggregator(), snapshot);
+  db_iter->SetIterUnderDBIter(internal_iter);
+  return db_iter;
+}
+
+Status DBImplSecondary::NewIterators(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  if (read_options.managed) {
+    return Status::NotSupported("Managed iterator is not supported anymore.");
+  }
+  if (read_options.read_tier == kPersistedTier) {
+    return Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators.");
+  }
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (iterators == nullptr) {
+    return Status::InvalidArgument("iterators not allowed to be nullptr");
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  if (read_options.tailing) {
+    return Status::NotSupported(
+        "tailing iterator not supported in secondary mode");
+  } else if (read_options.snapshot != nullptr) {
+    // TODO (yanqin) support snapshot.
+    return Status::NotSupported("snapshot not supported in secondary mode");
+  } else {
+    SequenceNumber read_seq = versions_->LastSequence();
+    for (auto cfh : column_families) {
+      ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+      iterators->push_back(
+          NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+    }
+  }
+  return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+  mutex_.AssertHeld();
+  Status s = DBImpl::CheckConsistency();
+  // If DBImpl::CheckConsistency(), which is stricter, returns success, then
+  // we do not need to give it a second chance.
+  if (s.ok()) {
+    return s;
+  }
+  // It's possible that DBImpl::CheckConsistency() can fail because the primary
+  // may have removed certain files, causing the GetFileSize(name) call to
+  // fail and return a PathNotFound. In this case, we take a best-effort
+  // approach and just proceed.
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+    return Status::OK();
+  }
+
+  std::vector<LiveFileMetaData> metadata;
+  versions_->GetLiveFilesMetaData(&metadata);
+
+  std::string corruption_messages;
+  for (const auto& md : metadata) {
+    // md.name has a leading "/".
+    std::string file_path = md.db_path + md.name;
+
+    uint64_t fsize = 0;
+    s = env_->GetFileSize(file_path, &fsize);
+    if (!s.ok() &&
+        (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+         s.IsPathNotFound())) {
+      s = Status::OK();
+    }
+    if (!s.ok()) {
+      corruption_messages +=
+          "Can't access " + md.name + ": " + s.ToString() + "\n";
+    }
+  }
+  return corruption_messages.empty()
+             ? Status::OK()
+             : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+  assert(versions_.get() != nullptr);
+  assert(manifest_reader_.get() != nullptr);
+  Status s;
+  // read the manifest and apply new changes to the secondary instance
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  JobContext job_context(0, true /*create_superversion*/);
+  {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    s = static_cast_with_check<ReactiveVersionSet, VersionSet>(versions_.get())
+            ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed);
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+                   static_cast<uint64_t>(versions_->LastSequence()));
+    for (ColumnFamilyData* cfd : cfds_changed) {
+      if (cfd->IsDropped()) {
+        ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+                        cfd->GetName().c_str());
+        continue;
+      }
+      VersionStorageInfo::LevelSummaryStorage tmp;
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+                      cfd->current()->storage_info()->LevelSummary(&tmp));
+    }
+
+    // list wal_dir to discover new WALs and apply new changes to the secondary
+    // instance
+    if (s.ok()) {
+      s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+    }
+    if (s.IsPathNotFound()) {
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "Secondary tries to read WAL, but WAL file(s) have already "
+          "been purged by primary.");
+      s = Status::OK();
+    }
+    if (s.ok()) {
+      for (auto cfd : cfds_changed) {
+        cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+                                       &job_context.memtables_to_free);
+        auto& sv_context = job_context.superversion_contexts.back();
+        cfd->InstallSuperVersion(&sv_context, &mutex_);
+        sv_context.NewSuperVersion();
+      }
+    }
+  }
+  job_context.Clean();
+
+  // Clean up unused, obsolete files.
+  JobContext purge_files_job_context(0);
+  {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    // Currently, the secondary instance does not own the database files, thus
+    // it is unnecessary for the secondary to force a full scan.
+    FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+  }
+  if (purge_files_job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(purge_files_job_context);
+  }
+  purge_files_job_context.Clean();
+  return s;
+}
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+                           const std::string& secondary_path, DB** dbptr) {
+  *dbptr = nullptr;
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+  std::vector<ColumnFamilyHandle*> handles;
+
+  Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+                                 column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::OpenAsSecondary(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::string& secondary_path,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  *dbptr = nullptr;
+  if (db_options.max_open_files != -1) {
+    // TODO (yanqin) maybe support max_open_files != -1 by creating hard links
+    // on SST files so that the db secondary can still have access to old SSTs
+    // while the primary instance may delete the originals.
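+    // Callers therefore open a secondary with max_open_files set to -1, e.g.
+    // (sketch; the paths are hypothetical):
+    //
+    //   Options options;
+    //   options.max_open_files = -1;
+    //   DB* db = nullptr;
+    //   Status s = DB::OpenAsSecondary(options, "/primary/db",
+    //                                  "/secondary/db", &db);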
+ return Status::InvalidArgument("require max_open_files to be -1"); + } + + DBOptions tmp_opts(db_options); + Status s; + if (nullptr == tmp_opts.info_log) { + s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); + if (!s.ok()) { + tmp_opts.info_log = nullptr; + } + } + + handles->clear(); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + impl->versions_.reset(new ReactiveVersionSet( + dbname, &impl->immutable_db_options_, impl->file_options_, + impl->table_cache_.get(), impl->write_buffer_manager_, + &impl->write_controller_)); + impl->column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + + impl->mutex_.Lock(); + s = impl->Recover(column_families, true, false, false); + if (s.ok()) { + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (nullptr == cfd) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + SuperVersionContext sv_context(true /* create_superversion */); + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} +#else // !ROCKSDB_LITE + +Status DB::OpenAsSecondary(const Options& /*options*/, + const std::string& /*name*/, + const std::string& /*secondary_path*/, + DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenAsSecondary( + const DBOptions& /*db_options*/, const std::string& /*dbname*/, + const std::string& /*secondary_path*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h new file mode 100644 index 000000000..24f2e7767 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_secondary.h @@ -0,0 +1,333 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper class to hold log reader, log reporter, log status. 
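+// The container owns all three and deletes them in its destructor.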
+class LogReaderContainer {
+ public:
+  LogReaderContainer()
+      : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+  LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+                     std::string fname,
+                     std::unique_ptr<SequentialFileReader>&& file_reader,
+                     uint64_t log_number) {
+    LogReporter* reporter = new LogReporter();
+    status_ = new Status();
+    reporter->env = env;
+    reporter->info_log = info_log.get();
+    reporter->fname = std::move(fname);
+    reporter->status = status_;
+    reporter_ = reporter;
+    // We intentionally make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+                                              reporter, true /*checksum*/,
+                                              log_number);
+  }
+  log::FragmentBufferedReader* reader_;
+  log::Reader::Reporter* reporter_;
+  Status* status_;
+  ~LogReaderContainer() {
+    delete reader_;
+    delete reporter_;
+    delete status_;
+  }
+ private:
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    std::string fname;
+    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+                     (this->status == nullptr ? "(ignoring error) " : ""),
+                     fname.c_str(), static_cast<int>(bytes),
+                     s.ToString().c_str());
+      if (this->status != nullptr && this->status->ok()) {
+        *this->status = s;
+      }
+    }
+  };
+};
+
+// The secondary instance shares access to the same storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make
+// best-effort attempts to catch up with the primary.
+class DBImplSecondary : public DBImpl {
+ public:
+  DBImplSecondary(const DBOptions& options, const std::string& dbname);
+  ~DBImplSecondary() override;
+
+  // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+  // and log_readers_ to facilitate future operations.
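+  // (Called once, under mutex_, from DB::OpenAsSecondary(); subsequent
+  // catch-up goes through TryCatchUpWithPrimary() rather than another
+  // Recover() pass.)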
+ Status Recover(const std::vector& column_families, + bool read_only, bool error_if_log_file_exist, + bool error_if_data_exists_in_logs, + uint64_t* = nullptr) override; + + // Implementations of the DB interface + using DB::Get; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value); + + using DBImpl::NewIterator; + Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback); + + Status NewIterators(const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::Merge; + Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::Delete; + Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::SingleDelete; + Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::CompactRange; + Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::CompactFiles; + Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status GetLiveFiles(std::vector&, + uint64_t* /*manifest_file_size*/, + bool /*flush_memtable*/ = true) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::Flush; + Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::SetDBOptions; + Status SetDBOptions(const std::unordered_map& + /*options_map*/) override { + // 
Currently not supported because changing certain options may cause
+    // flush/compaction.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* /*cfd*/,
+      const std::unordered_map<std::string, std::string>& /*options_map*/)
+      override {
+    // Currently not supported because changing certain options may cause
+    // flush/compaction and/or write to MANIFEST.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SyncWAL;
+  Status SyncWAL() override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DB::IngestExternalFile;
+  Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+  Status TryCatchUpWithPrimary() override;
+
+  // Try to find the log reader for log_number in the log_readers_ map;
+  // initialize one if it doesn't exist
+  Status MaybeInitLogReader(uint64_t log_number,
+                            log::FragmentBufferedReader** log_reader);
+
+  // Check if all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
+  Status CheckConsistency() override;
+
+ protected:
+  // ColumnFamilyCollector is a write batch handler which does nothing
+  // except recording unique column family IDs
+  class ColumnFamilyCollector : public WriteBatch::Handler {
+    std::unordered_set<uint32_t> column_family_ids_;
+
+    Status AddColumnFamilyId(uint32_t column_family_id) {
+      if (column_family_ids_.find(column_family_id) ==
+          column_family_ids_.end()) {
+        column_family_ids_.insert(column_family_id);
+      }
+      return Status::OK();
+    }
+
+   public:
+    explicit ColumnFamilyCollector() {}
+
+    ~ColumnFamilyCollector() override {}
+
+    Status PutCF(uint32_t column_family_id, const Slice&,
+                 const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status DeleteCF(uint32_t column_family_id, const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
+                         const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status MergeCF(uint32_t column_family_id, const Slice&,
+                   const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
+                          const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    const std::unordered_set<uint32_t>& column_families() const {
+      return column_family_ids_;
+    }
+  };
+
+  Status CollectColumnFamilyIdsFromWriteBatch(
+      const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) {
+    assert(column_family_ids != nullptr);
+    column_family_ids->clear();
+    ColumnFamilyCollector handler;
+    Status s = batch.Iterate(&handler);
+    if (s.ok()) {
+      for (const auto& cf : handler.column_families()) {
+        column_family_ids->push_back(cf);
+      }
+    }
+    return s;
+  }
+
+  bool OwnTablesAndLogs() const override {
+    // Currently, the secondary instance does not own the database files. It
+    // simply opens the files of the primary instance and tracks their file
+    // descriptors until they become obsolete. In the future, the secondary may
+    // create links to database files. OwnTablesAndLogs will return true then.
+    return false;
+  }
+
+ private:
+  friend class DB;
+
+  // No copying allowed
+  DBImplSecondary(const DBImplSecondary&);
+  void operator=(const DBImplSecondary&);
+
+  using DBImpl::Recover;
+
+  Status FindAndRecoverLogFiles(
+      std::unordered_set<ColumnFamilyData*>* cfds_changed,
+      JobContext* job_context);
+  Status FindNewLogNumbers(std::vector<uint64_t>* logs);
+  // After manifest recovery, replay WALs and refresh log_readers_ if necessary
+  // REQUIRES: log_numbers are sorted in ascending order
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* next_sequence,
+                         std::unordered_set<ColumnFamilyData*>* cfds_changed,
+                         JobContext* job_context);
+
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
+  std::unique_ptr<Status> manifest_reader_status_;
+
+  // Cache log readers for each log number, used to continue WAL replay
+  // after recovery
+  std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
+
+  // Current WAL number replayed for each column family.
+  std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc
new file mode 100644
index 000000000..8f6f685e4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_write.cc
@@ -0,0 +1,1839 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
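+//
+// A minimal write sketch (editor's illustration; the keys and values are
+// placeholders, the entry points are the ones implemented below):
+//
+//   WriteBatch batch;
+//   batch.Put("key1", "value1");
+//   batch.Delete("key2");
+//   Status s = db->Write(WriteOptions(), &batch);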
+#include "db/db_impl/db_impl.h" + +#include +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "monitoring/perf_context_imp.h" +#include "options/options_helper.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + return DB::Put(o, column_family, key, val); +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + auto cfh = reinterpret_cast(column_family); + if (!cfh->cfd()->ioptions()->merge_operator) { + return Status::NotSupported("Provide a merge_operator when opening DB"); + } else { + return DB::Merge(o, column_family, key, val); + } +} + +Status DBImpl::Delete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key) { + return DB::Delete(write_options, column_family, key); +} + +Status DBImpl::SingleDelete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& key) { + return DB::SingleDelete(write_options, column_family, key); +} + +void DBImpl::SetRecoverableStatePreReleaseCallback( + PreReleaseCallback* callback) { + recoverable_state_pre_release_callback_.reset(callback); +} + +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { + return WriteImpl(write_options, my_batch, nullptr, nullptr); +} + +#ifndef ROCKSDB_LITE +Status DBImpl::WriteWithCallback(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback) { + return WriteImpl(write_options, my_batch, callback, nullptr); +} +#endif // ROCKSDB_LITE + +// The main write queue. This is the only write queue that updates LastSequence. +// When using one write queue, the same sequence also indicates the last +// published sequence. +Status DBImpl::WriteImpl(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable, uint64_t* seq_used, + size_t batch_cnt, + PreReleaseCallback* pre_release_callback) { + assert(!seq_per_batch_ || batch_cnt != 0); + if (my_batch == nullptr) { + return Status::Corruption("Batch is nullptr!"); + } + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Write(my_batch); + } + } + if (write_options.sync && write_options.disableWAL) { + return Status::InvalidArgument("Sync writes has to enable WAL."); + } + if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with concurrent prepares"); + } + if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) { + // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt + return Status::NotSupported( + "pipelined_writes is not compatible with seq_per_batch"); + } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } + // Otherwise IsLatestPersistentState optimization does not make sense + assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || + disable_memtable); + + Status status; + if (write_options.low_pri) { + status = ThrottleLowPriWritesIfNeeded(write_options, my_batch); + if (!status.ok()) { + return status; + } + } + + if (two_write_queues_ && disable_memtable) { + AssignOrder assign_order = + seq_per_batch_ ? 
kDoAssignOrder : kDontAssignOrder; + // Otherwise these are WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence numbers. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? batch_cnt + // every key is a sub-batch consuming a seq + : WriteBatchInternal::Count(my_batch); + uint64_t seq; + // Use a write thread to i) optimize for WAL write, ii) publish last + // sequence in increasing order, iii) call pre_release_callback serially + status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, + log_used, log_ref, &seq, sub_batch_cnt, + pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); + TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL"); + if (!status.ok()) { + return status; + } + if (seq_used) { + *seq_used = seq; + } + if (!disable_memtable) { + TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"); + status = UnorderedWriteMemtable(write_options, my_batch, callback, + log_ref, seq, sub_batch_cnt); + } + return status; + } + + if (immutable_db_options_.enable_pipelined_write) { + return PipelinedWriteImpl(write_options, my_batch, callback, log_used, + log_ref, disable_memtable, seq_used); + } + + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable, batch_cnt, pre_release_callback); + + if (!write_options.disableWAL) { + RecordTick(stats_, WRITE_WITH_WAL); + } + + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + write_thread_.JoinBatchGroup(&w); + if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { + // we are a non-leader in a parallel group + + if (w.ShouldWriteToMemtable()) { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_memtable_time); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt, + batch_per_txn_, write_options.memtable_insert_hint_per_batch); + + PERF_TIMER_START(write_pre_and_post_process_time); + } + + if (write_thread_.CompleteParallelMemTableWriter(&w)) { + // we're responsible for exiting the batch group + // TODO(myabandeh): propagate status to write_group + auto last_sequence = w.write_group->last_sequence; + versions_->SetLastSequence(last_sequence); + MemTableInsertStatusCheck(w.status); + write_thread_.ExitAsBatchGroupFollower(&w); + } + assert(w.state == WriteThread::STATE_COMPLETED); + // STATE_COMPLETED conditional below handles exit + + status = w.FinalStatus(); + } + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + // write is complete and leader has updated sequence + return w.FinalStatus(); + } + // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); + + // Once it reaches this point, the current writer "w" will try to do its + // write job. It may also pick up some of the remaining writers in the + // "writers_" queue when it finds them suitable, and finish them in the same + // write batch. This is how a write job can be completed on behalf of other + // writers.
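+  // In outline, the group-commit protocol implemented below is: +  //   1. JoinBatchGroup(&w) above either made "w" the leader or parked it +  //      until its write was completed under another leader. +  //   2. The leader calls EnterAsBatchGroupLeader to adopt the writers +  //      queued behind it, appends the merged batch to the WAL once, then +  //      inserts into the memtables -- serially, or via parallel followers +  //      when allow_concurrent_memtable_write permits. +  //   3. ExitAsBatchGroupLeader wakes the finished writers and hands +  //      leadership to the next waiting writer.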
WriteContext write_context; + WriteThread::WriteGroup write_group; + bool in_parallel_group = false; + uint64_t last_sequence = kMaxSequenceNumber; + + mutex_.Lock(); + + bool need_log_sync = write_options.sync; + bool need_log_dir_sync = need_log_sync && !log_dir_synced_; + if (!two_write_queues_ || !disable_memtable) { + // With concurrent writes we do preprocess only in the write thread that + // also does the write to memtable, to avoid sync issues on shared data + // structures with the other thread + + // PreprocessWrite does its own perf timing. + PERF_TIMER_STOP(write_pre_and_post_process_time); + + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + if (!two_write_queues_) { + // Assign it after ::PreprocessWrite since the sequence might advance + // inside it by WriteRecoverableState + last_sequence = versions_->LastSequence(); + } + + PERF_TIMER_START(write_pre_and_post_process_time); + } + log::Writer* log_writer = logs_.back().writer; + + mutex_.Unlock(); + + // Add to log and apply to memtable. We can release the lock + // during this phase since &w is currently responsible for logging + // and protects against concurrent loggers and concurrent writes + // into memtables + + TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters"); + last_batch_group_size_ = + write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + + if (status.ok()) { + // Rules for when we can update the memtable concurrently + // 1. supported by memtable + // 2. Puts are not okay if inplace_update_support + // 3. Merges are not okay + // + // Rules 1..2 are enforced by checking the options + // during startup (CheckConcurrentWritesSupported), so if + // options.allow_concurrent_memtable_write is true then they can be + // assumed to be true. Rule 3 is checked for each batch. We could + // relax rule 2 if we could prevent write batches from referring + // more than once to a particular key. + bool parallel = immutable_db_options_.allow_concurrent_memtable_write && + write_group.size > 1; + size_t total_count = 0; + size_t valid_batches = 0; + size_t total_byte_size = 0; + size_t pre_release_callback_cnt = 0; + for (auto* writer : write_group) { + if (writer->CheckCallback(this)) { + valid_batches += writer->batch_cnt; + if (writer->ShouldWriteToMemtable()) { + total_count += WriteBatchInternal::Count(writer->batch); + parallel = parallel && !writer->batch->HasMerge(); + } + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } + } + } + // Note about seq_per_batch_: either disableWAL is set for the entire write + // group or not. In either case we inc seq for each write batch with no + // failed callback. This means that there could be a batch with + // disable_memtable in between; although we do not write this batch to + // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc + // the seq per valid written key to mem. + size_t seq_inc = seq_per_batch_ ? valid_batches : total_count; + + const bool concurrent_update = two_write_queues_; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count, + concurrent_update); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, + concurrent_update); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_SELF); + auto write_done_by_other = write_group.size - 1; + if (write_done_by_other > 0) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); + } + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (!two_write_queues_) { + if (status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + status = WriteToWAL(write_group, log_writer, log_used, need_log_sync, + need_log_dir_sync, last_sequence + 1); + } + } else { + if (status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + // LastAllocatedSequence is increased inside WriteToWAL under + // wal_write_mutex_ to ensure ordered events in WAL + status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, + seq_inc); + } else { + // Otherwise we inc seq number for memtable writes + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + } + } + assert(last_sequence != kMaxSequenceNumber); + const SequenceNumber current_sequence = last_sequence + 1; + last_sequence += seq_inc; + + // PreReleaseCallback is called after WAL write and before memtable write + if (status.ok()) { + SequenceNumber next_sequence = current_sequence; + size_t index = 0; + // Note: the logic for advancing seq here must be consistent with the + // logic in WriteBatchInternal::InsertInto(write_group...) as well as + // with WriteBatchInternal::InsertInto(write_batch...) that is called on + // the merged batch during recovery from the WAL. + for (auto* writer : write_group) { + if (writer->CallbackFailed()) { + continue; + } + writer->sequence = next_sequence; + if (writer->pre_release_callback) { + Status ws = writer->pre_release_callback->Callback( + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); + if (!ws.ok()) { + status = ws; + break; + } + } + if (seq_per_batch_) { + assert(writer->batch_cnt); + next_sequence += writer->batch_cnt; + } else if (writer->ShouldWriteToMemtable()) { + next_sequence += WriteBatchInternal::Count(writer->batch); + } + } + } + + if (status.ok()) { + PERF_TIMER_GUARD(write_memtable_time); + + if (!parallel) { + // w.sequence will be set inside InsertInto + w.status = WriteBatchInternal::InsertInto( + write_group, current_sequence, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, + 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, + batch_per_txn_); + } else { + write_group.last_sequence = last_sequence; + write_thread_.LaunchParallelMemTableWriters(&write_group); + in_parallel_group = true; + + // Each parallel follower does its own writes. The leader should + // also do its own.
+ if (w.ShouldWriteToMemtable()) { + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + assert(w.sequence == current_sequence); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, + this, true /*concurrent_memtable_writes*/, seq_per_batch_, + w.batch_cnt, batch_per_txn_, + write_options.memtable_insert_hint_per_batch); + } + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + } + } + PERF_TIMER_START(write_pre_and_post_process_time); + + if (!w.CallbackFailed()) { + WriteStatusCheck(status); + } + + if (need_log_sync) { + mutex_.Lock(); + MarkLogsSynced(logfile_number_, need_log_dir_sync, status); + mutex_.Unlock(); + // Requesting sync with two_write_queues_ is expected to be very rare. We + // hence provide a simple implementation that is not necessarily efficient. + if (two_write_queues_) { + if (manual_wal_flush_) { + status = FlushWAL(true); + } else { + status = SyncWAL(); + } + } + } + + bool should_exit_batch_group = true; + if (in_parallel_group) { + // CompleteParallelWorker returns true if this thread should + // handle exit, false means somebody else did + should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w); + } + if (should_exit_batch_group) { + if (status.ok()) { + // Note: if we are to resume after non-OK statuses we need to revisit how + // we react to non-OK statuses here. + versions_->SetLastSequence(last_sequence); + } + MemTableInsertStatusCheck(w.status); + write_thread_.ExitAsBatchGroupLeader(write_group, status); + } + + if (status.ok()) { + status = w.FinalStatus(); + } + return status; +} + +Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable, uint64_t* seq_used) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteContext write_context; + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable); + write_thread_.JoinBatchGroup(&w); + if (w.state == WriteThread::STATE_GROUP_LEADER) { + WriteThread::WriteGroup wal_write_group; + if (w.callback && !w.callback->AllowWriteBatching()) { + write_thread_.WaitForMemTableWriters(); + } + mutex_.Lock(); + bool need_log_sync = !write_options.disableWAL && write_options.sync; + bool need_log_dir_sync = need_log_sync && !log_dir_synced_; + // PreprocessWrite does its own perf timing. + PERF_TIMER_STOP(write_pre_and_post_process_time); + w.status = PreprocessWrite(write_options, &need_log_sync, &write_context); + PERF_TIMER_START(write_pre_and_post_process_time); + log::Writer* log_writer = logs_.back().writer; + mutex_.Unlock(); + + // This can set a non-OK status if the callback fails.
+ last_batch_group_size_ = + write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group); + const SequenceNumber current_sequence = + write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1; + size_t total_count = 0; + size_t total_byte_size = 0; + + if (w.status.ok()) { + SequenceNumber next_sequence = current_sequence; + for (auto writer : wal_write_group) { + if (writer->CheckCallback(this)) { + if (writer->ShouldWriteToMemtable()) { + writer->sequence = next_sequence; + size_t count = WriteBatchInternal::Count(writer->batch); + next_sequence += count; + total_count += count; + } + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + } + } + if (w.disable_wal) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + write_thread_.UpdateLastSequence(current_sequence + total_count - 1); + } + + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (w.status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); + RecordTick(stats_, WRITE_DONE_BY_SELF, 1); + if (wal_write_group.size > 1) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + wal_write_group.size - 1); + RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); + } + w.status = WriteToWAL(wal_write_group, log_writer, log_used, + need_log_sync, need_log_dir_sync, current_sequence); + } + + if (!w.CallbackFailed()) { + WriteStatusCheck(w.status); + } + + if (need_log_sync) { + mutex_.Lock(); + MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status); + mutex_.Unlock(); + } + + write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); + } + + WriteThread::WriteGroup memtable_write_group; + if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { + PERF_TIMER_GUARD(write_memtable_time); + assert(w.ShouldWriteToMemtable()); + write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group); + if (memtable_write_group.size > 1 && + immutable_db_options_.allow_concurrent_memtable_write) { + write_thread_.LaunchParallelMemTableWriters(&memtable_write_group); + } else { + memtable_write_group.status = WriteBatchInternal::InsertInto( + memtable_write_group, w.sequence, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_); + versions_->SetLastSequence(memtable_write_group.last_sequence); + write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); + } + } + + if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { + assert(w.ShouldWriteToMemtable()); + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/, + 
write_options.memtable_insert_hint_per_batch); + if (write_thread_.CompleteParallelMemTableWriter(&w)) { + MemTableInsertStatusCheck(w.status); + versions_->SetLastSequence(w.write_group->last_sequence); + write_thread_.ExitAsMemTableWriter(&w, *w.write_group); + } + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + + assert(w.state == WriteThread::STATE_COMPLETED); + return w.FinalStatus(); +} + +Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback, uint64_t log_ref, + SequenceNumber seq, + const size_t sub_batch_cnt) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + false /*disable_memtable*/); + + if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { + w.sequence = seq; + size_t total_count = WriteBatchInternal::Count(my_batch); + InternalStats* stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); + + WriteStatusCheck(w.status); + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + } + + size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; + if (pending_cnt == 0) { + // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex + // before the notify ensures that the cv is in the waiting state when it + // is notified, thus not missing the update to pending_memtable_writes_ + // even though it is not modified under the mutex. + std::lock_guard<std::mutex> lck(switch_mutex_); + switch_cv_.notify_all(); + } + + if (!w.FinalStatus().ok()) { + return w.FinalStatus(); + } + return Status::OK(); +} + +// The 2nd write queue. If enabled, it will be used only for WAL-only writes. +// This is the only queue that updates LastPublishedSequence, which is only +// applicable in a two-queue setting.
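+// This path is exercised when the DB is opened with two_write_queues (used, +// e.g., by the transaction layers for WAL-only Prepare batches) or with +// unordered_write. A configuration sketch, illustrative only: +// +//   DBOptions db_opts; +//   db_opts.two_write_queues = true;  // adds the second, WAL-only queue +//   db_opts.unordered_write = true;   // memtable ordering handled separately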
+Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { + Status status; + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable, sub_batch_cnt, pre_release_callback); + RecordTick(stats_, WRITE_WITH_WAL); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + write_thread->JoinBatchGroup(&w); + assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + return w.FinalStatus(); + } + // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. + if (status.ok()) { + InstrumentedMutexLock l(&mutex_); + bool need_log_sync = false; + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + WriteStatusCheck(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + + WriteThread::WriteGroup write_group; + uint64_t last_sequence; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + // Note: no need to update last_batch_group_size_ here since the batch writes + // to WAL only + + size_t pre_release_callback_cnt = 0; + size_t total_byte_size = 0; + for (auto* writer : write_group) { + if (writer->CheckCallback(this)) { + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } + } + } + + const bool concurrent_update = true; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. + // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. 
+ auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, + concurrent_update); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_SELF); + auto write_done_by_other = write_group.size - 1; + if (write_done_by_other > 0) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); + } + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + PERF_TIMER_GUARD(write_wal_time); + // LastAllocatedSequence is increased inside WriteToWAL under + // wal_write_mutex_ to ensure ordered events in WAL + size_t seq_inc = 0 /* total_count */; + if (assign_order == kDoAssignOrder) { + size_t total_batch_cnt = 0; + for (auto* writer : write_group) { + assert(writer->batch_cnt || !seq_per_batch_); + if (!writer->CallbackFailed()) { + total_batch_cnt += writer->batch_cnt; + } + } + seq_inc = total_batch_cnt; + } + if (!write_options.disableWAL) { + status = + ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + } else { + // Otherwise we inc seq number to do solely the seq allocation + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + } + + size_t memtable_write_cnt = 0; + auto curr_seq = last_sequence + 1; + for (auto* writer : write_group) { + if (writer->CallbackFailed()) { + continue; + } + writer->sequence = curr_seq; + if (assign_order == kDoAssignOrder) { + assert(writer->batch_cnt || !seq_per_batch_); + curr_seq += writer->batch_cnt; + } + if (!writer->disable_memtable) { + memtable_write_cnt++; + } + // else seq advances only by memtable writes + } + if (status.ok() && write_options.sync) { + assert(!write_options.disableWAL); + // Requesting sync with two_write_queues_ is expected to be very rare. We + // hence provide a simple implementation that is not necessarily efficient. + if (manual_wal_flush_) { + status = FlushWAL(true); + } else { + status = SyncWAL(); + } + } + PERF_TIMER_START(write_pre_and_post_process_time); + + if (!w.CallbackFailed()) { + WriteStatusCheck(status); + } + if (status.ok()) { + size_t index = 0; + for (auto* writer : write_group) { + if (!writer->CallbackFailed() && writer->pre_release_callback) { + assert(writer->sequence != kMaxSequenceNumber); + Status ws = writer->pre_release_callback->Callback( + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); + if (!ws.ok()) { + status = ws; + break; + } + } + } + } + if (publish_last_seq == kDoPublishLastSeq) { + versions_->SetLastSequence(last_sequence + seq_inc); + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + } + if (immutable_db_options_.unordered_write && status.ok()) { + pending_memtable_writes_ += memtable_write_cnt; + } + write_thread->ExitAsBatchGroupLeader(write_group, status); + if (status.ok()) { + status = w.FinalStatus(); + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + return status; +} + +void DBImpl::WriteStatusCheck(const Status& status) { + // Is setting bg_error_ enough here? This will at least stop + // compaction and fail any further writes.
+ if (immutable_db_options_.paranoid_checks && !status.ok() && + !status.IsBusy() && !status.IsIncomplete()) { + mutex_.Lock(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); + mutex_.Unlock(); + } +} + +void DBImpl::MemTableInsertStatusCheck(const Status& status) { + // A non-OK status here indicates that the state implied by the + // WAL has diverged from the in-memory state. This could be + // because of a corrupt write_batch (very bad), or because the + // client specified an invalid column family and didn't specify + // ignore_missing_column_families. + if (!status.ok()) { + mutex_.Lock(); + assert(!error_handler_.IsBGWorkStopped()); + error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable); + mutex_.Unlock(); + } +} + +Status DBImpl::PreprocessWrite(const WriteOptions& write_options, + bool* need_log_sync, + WriteContext* write_context) { + mutex_.AssertHeld(); + assert(write_context != nullptr && need_log_sync != nullptr); + Status status; + + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + + PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time); + + assert(!single_column_family_mode_ || + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); + if (UNLIKELY(status.ok() && !single_column_family_mode_ && + total_log_size_ > GetMaxTotalWalSize())) { + WaitForPendingWrites(); + status = SwitchWAL(write_context); + } + + if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) { + // Before a new memtable is added in SwitchMemtable(), + // write_buffer_manager_->ShouldFlush() will keep returning true. If another + // thread is writing to another DB with the same write buffer, it may also + // be flushed. We may end up flushing many more DBs than needed. It's + // suboptimal but still correct. + WaitForPendingWrites(); + status = HandleWriteBufferFull(write_context); + } + + if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { + status = TrimMemtableHistory(write_context); + } + + if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + WaitForPendingWrites(); + status = ScheduleFlushes(write_context); + } + + PERF_TIMER_STOP(write_scheduling_flushes_compactions_time); + PERF_TIMER_GUARD(write_pre_and_post_process_time); + + if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || + write_controller_.NeedsDelay()))) { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_delay_time); + // We don't know the size of the current batch, so we always use the size + // of the previous one. It might create a fairness issue in that + // expiration might happen for smaller writes while larger writes can go + // through. Can optimize it if it becomes an issue. + status = DelayWrite(last_batch_group_size_, write_options); + PERF_TIMER_START(write_pre_and_post_process_time); + } + + if (status.ok() && *need_log_sync) { + // Wait until the parallel syncs are finished. Any sync process has to sync + // the front log too, so it is enough to check the status of front(). + // We do a while loop since log_sync_cv_ is signalled when any sync is + // finished + // Note: there does not seem to be a reason to wait for parallel sync at + // this early step, but it is not important since parallel sync (SyncWAL) + // and need_log_sync are usually not used together. + while (logs_.front().getting_synced) { + log_sync_cv_.Wait(); + } + for (auto& log : logs_) { + assert(!log.getting_synced); + // This is just to prevent the logs from being synced by a parallel + // SyncWAL call.
We will do the actual syncing later, after we write to + // the WAL. + // Note: there does not seem to be a reason to set this early before we + // actually write to the WAL + log.getting_synced = true; + } + } else { + *need_log_sync = false; + } + + return status; +} + +WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, + WriteBatch* tmp_batch, size_t* write_with_wal, + WriteBatch** to_be_cached_state) { + assert(write_with_wal != nullptr); + assert(tmp_batch != nullptr); + assert(*to_be_cached_state == nullptr); + WriteBatch* merged_batch = nullptr; + *write_with_wal = 0; + auto* leader = write_group.leader; + assert(!leader->disable_wal); // Same holds for all in the batch group + if (write_group.size == 1 && !leader->CallbackFailed() && + leader->batch->GetWalTerminationPoint().is_cleared()) { + // We simply write the first WriteBatch to the WAL if the group contains + // only one batch, that batch should be written to the WAL, + // and the batch is not to be truncated + merged_batch = leader->batch; + if (WriteBatchInternal::IsLatestPersistentState(merged_batch)) { + *to_be_cached_state = merged_batch; + } + *write_with_wal = 1; + } else { + // WAL needs all of the batches flattened into a single batch. + // We could avoid copying here with an iov-like AddRecord + // interface + merged_batch = tmp_batch; + for (auto writer : write_group) { + if (!writer->CallbackFailed()) { + WriteBatchInternal::Append(merged_batch, writer->batch, + /*WAL_only*/ true); + if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) { + // We only need to cache the last of such write batches + *to_be_cached_state = writer->batch; + } + (*write_with_wal)++; + } + } + } + return merged_batch; +} + +// When two_write_queues_ is disabled, this function is called from the only +// write thread. Otherwise this must be called holding log_write_mutex_. +Status DBImpl::WriteToWAL(const WriteBatch& merged_batch, + log::Writer* log_writer, uint64_t* log_used, + uint64_t* log_size) { + assert(log_size != nullptr); + Slice log_entry = WriteBatchInternal::Contents(&merged_batch); + *log_size = log_entry.size(); + // With two_write_queues_, WriteToWAL has to be protected from concurrent + // calls from the two queues anyway, and log_write_mutex_ is already held. + // Otherwise, if manual_wal_flush_ is enabled, we need to protect + // log_writer->AddRecord from possible concurrent calls via FlushWAL by the + // application. + const bool needs_locking = manual_wal_flush_ && !two_write_queues_; + // Due to performance concerns from missed branch prediction, penalize the + // new manual_wal_flush_ feature (by UNLIKELY) instead of the more common + // case when we do not need any locking.
+ if (UNLIKELY(needs_locking)) { + log_write_mutex_.Lock(); + } + Status status = log_writer->AddRecord(log_entry); + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Unlock(); + } + if (log_used != nullptr) { + *log_used = logfile_number_; + } + total_log_size_ += log_entry.size(); + // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here + // since alive_log_files_ might be modified concurrently + alive_log_files_.back().AddSize(log_entry.size()); + log_empty_ = false; + return status; +} + +Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* log_used, + bool need_log_sync, bool need_log_dir_sync, + SequenceNumber sequence) { + Status status; + + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group + size_t write_with_wal = 0; + WriteBatch* to_be_cached_state = nullptr; + WriteBatch* merged_batch = MergeBatch(write_group, &tmp_batch_, + &write_with_wal, &to_be_cached_state); + if (merged_batch == write_group.leader->batch) { + write_group.leader->log_used = logfile_number_; + } else if (write_with_wal > 1) { + for (auto writer : write_group) { + writer->log_used = logfile_number_; + } + } + + WriteBatchInternal::SetSequence(merged_batch, sequence); + + uint64_t log_size; + status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } + + if (status.ok() && need_log_sync) { + StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + // It's safe to access logs_ with unlocked mutex_ here because: + // - we've set getting_synced=true for all logs, + // so other threads won't pop from logs_ while we're here, + // - only writer thread can push to logs_, and we're in + // writer thread, so no one will push to logs_, + // - as long as other threads don't modify it, it's safe to read + // from std::deque from multiple threads concurrently. + for (auto& log : logs_) { + status = log.writer->file()->Sync(immutable_db_options_.use_fsync); + if (!status.ok()) { + break; + } + } + if (status.ok() && need_log_dir_sync) { + // We only sync WAL directory the first time WAL syncing is + // requested, so that in case users never turn on WAL sync, + // we can avoid the disk I/O in the write code path. 
+ status = directories_.GetWalDir()->Fsync(); + } + } + + if (merged_batch == &tmp_batch_) { + tmp_batch_.Clear(); + } + if (status.ok()) { + auto stats = default_cf_internal_stats_; + if (need_log_sync) { + stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); + RecordTick(stats_, WAL_FILE_SYNCED); + } + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return status; +} + +Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, + uint64_t* log_used, + SequenceNumber* last_sequence, + size_t seq_inc) { + Status status; + + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group + WriteBatch tmp_batch; + size_t write_with_wal = 0; + WriteBatch* to_be_cached_state = nullptr; + WriteBatch* merged_batch = + MergeBatch(write_group, &tmp_batch, &write_with_wal, &to_be_cached_state); + + // We need to lock log_write_mutex_ since logs_ and alive_log_files_ might + // be pushed back concurrently + log_write_mutex_.Lock(); + if (merged_batch == write_group.leader->batch) { + write_group.leader->log_used = logfile_number_; + } else if (write_with_wal > 1) { + for (auto writer : write_group) { + writer->log_used = logfile_number_; + } + } + *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + auto sequence = *last_sequence + 1; + WriteBatchInternal::SetSequence(merged_batch, sequence); + + log::Writer* log_writer = logs_.back().writer; + uint64_t log_size; + status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } + log_write_mutex_.Unlock(); + + if (status.ok()) { + const bool concurrent = true; + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, + concurrent); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal, + concurrent); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return status; +} + +Status DBImpl::WriteRecoverableState() { + mutex_.AssertHeld(); + if (!cached_recoverable_state_empty_) { + bool dont_care_bool; + SequenceNumber next_seq; + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + SequenceNumber seq; + if (two_write_queues_) { + seq = versions_->FetchAddLastAllocatedSequence(0); + } else { + seq = versions_->LastSequence(); + } + WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1); + auto status = WriteBatchInternal::InsertInto( + &cached_recoverable_state_, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, true, + 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */, + &next_seq, &dont_care_bool, seq_per_batch_); + auto last_seq = next_seq - 1; + if (two_write_queues_) { + versions_->FetchAddLastAllocatedSequence(last_seq - seq); + versions_->SetLastPublishedSequence(last_seq); + } + versions_->SetLastSequence(last_seq); + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + if (status.ok() && recoverable_state_pre_release_callback_) { + const bool DISABLE_MEMTABLE = true; + for (uint64_t sub_batch_seq = seq + 1; + sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) { + uint64_t const no_log_num = 0; + // Unlock it since the callback might end up
locking the mutex, e.g., + // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB + mutex_.Unlock(); + status = recoverable_state_pre_release_callback_->Callback( + sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1); + mutex_.Lock(); + } + } + if (status.ok()) { + cached_recoverable_state_.Clear(); + cached_recoverable_state_empty_ = true; + } + return status; + } + return Status::OK(); +} + +void DBImpl::SelectColumnFamiliesForAtomicFlush( + autovector<ColumnFamilyData*>* cfds) { + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + cfds->push_back(cfd); + } + } +} + +// Assign sequence number for atomic flush. +void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) { + assert(immutable_db_options_.atomic_flush); + auto seq = versions_->LastSequence(); + for (auto cfd : cfds) { + cfd->imm()->AssignAtomicFlushSeq(seq); + } +} + +Status DBImpl::SwitchWAL(WriteContext* write_context) { + mutex_.AssertHeld(); + assert(write_context != nullptr); + Status status; + + if (alive_log_files_.begin()->getting_flushed) { + return status; + } + + auto oldest_alive_log = alive_log_files_.begin()->number; + bool flush_wont_release_oldest_log = false; + if (allow_2pc()) { + auto oldest_log_with_uncommitted_prep = + logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep(); + + assert(oldest_log_with_uncommitted_prep == 0 || + oldest_log_with_uncommitted_prep >= oldest_alive_log); + if (oldest_log_with_uncommitted_prep > 0 && + oldest_log_with_uncommitted_prep == oldest_alive_log) { + if (unable_to_release_oldest_log_) { + // we already attempted to flush all column families dependent on + // the oldest alive log but the log still contained uncommitted + // transactions so there is still nothing that we can do. + return status; + } else { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Unable to release oldest log due to uncommitted transaction"); + unable_to_release_oldest_log_ = true; + flush_wont_release_oldest_log = true; + } + } + } + if (!flush_wont_release_oldest_log) { + // we only mark this log as getting flushed if we have successfully + // flushed all data in this log. If this log contains outstanding prepared + // transactions then we cannot flush this log until those transactions are + // committed. + unable_to_release_oldest_log_ = false; + alive_log_files_.begin()->getting_flushed = true; + } + + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Flushing all column families with data in WAL number %" PRIu64 + ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, + oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
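+  // (The threshold checked by the caller comes from GetMaxTotalWalSize(); by +  // default it is derived from the memtable sizes, and it can be pinned +  // explicitly. An illustrative setting, not a recommendation: +  //   db_opts.max_total_wal_size = 4ull << 30;  // cap live WALs at ~4 GB +  // Once the live WALs exceed that, this flush-and-switch path runs.)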
Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, + oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->OldestLogToKeep() <= oldest_alive_log) { + cfds.push_back(cfd); + } + } + MaybeFlushStatsCF(&cfds); + } + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + for (const auto cfd : cfds) { + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->UnrefAndTryDelete(); + if (!status.ok()) { + break; + } + } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + for (auto cfd : cfds) { + cfd->imm()->FlushRequested(); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { + mutex_.AssertHeld(); + assert(write_context != nullptr); + Status status; + + // Before a new memtable is added in SwitchMemtable(), + // write_buffer_manager_->ShouldFlush() will keep returning true. If another + // thread is writing to another DB with the same write buffer, they may also + // be flushed. We may end up with flushing much more DBs than needed. It's + // suboptimal but still correct. + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Flushing column family with oldest memtable entry. Write buffer is " + "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".", + write_buffer_manager_->memory_usage(), + write_buffer_manager_->buffer_size()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + } else { + ColumnFamilyData* cfd_picked = nullptr; + SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; + + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (!cfd->mem()->IsEmpty()) { + // We only consider active mem table, hoping immutable memtable is + // already in the process of flushing. 
+ uint64_t seq = cfd->mem()->GetCreationSeq(); + if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { + cfd_picked = cfd; + seq_num_for_cf_picked = seq; + } + } + } + if (cfd_picked != nullptr) { + cfds.push_back(cfd_picked); + } + MaybeFlushStatsCF(&cfds); + } + + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + for (const auto cfd : cfds) { + if (cfd->mem()->IsEmpty()) { + continue; + } + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->UnrefAndTryDelete(); + if (!status.ok()) { + break; + } + } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + for (const auto cfd : cfds) { + cfd->imm()->FlushRequested(); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +uint64_t DBImpl::GetMaxTotalWalSize() const { + mutex_.AssertHeld(); + return mutable_db_options_.max_total_wal_size == 0 + ? 4 * max_total_in_memory_state_ + : mutable_db_options_.max_total_wal_size; +} + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +Status DBImpl::DelayWrite(uint64_t num_bytes, + const WriteOptions& write_options) { + uint64_t time_delayed = 0; + bool delayed = false; + { + StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); + uint64_t delay = write_controller_.GetDelay(env_, num_bytes); + if (delay > 0) { + if (write_options.no_slowdown) { + return Status::Incomplete("Write stall"); + } + TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); + + // Notify write_thread_ about the stall so it can set up a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); + mutex_.Unlock(); + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore + const uint64_t kDelayInterval = 1000; + uint64_t stall_end = sw.start_time() + delay; + while (write_controller_.NeedsDelay()) { + if (env_->NowMicros() >= stall_end) { + // We already delayed this write `delay` microseconds + break; + } + + delayed = true; + // Sleep for 0.001 seconds + env_->SleepForMicroseconds(kDelayInterval); + } + mutex_.Lock(); + write_thread_.EndWriteStall(); + }
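+ +    // (The delay computed above is paced by WriteController according to the +    // configured delayed_write_rate; an illustrative tuning, not a +    // recommendation: +    //   db_opts.delayed_write_rate = 16 << 20;  // pace stalled writes, ~16 MB/s +    // )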
+ + // Don't wait if there's a background error, even if it's a soft error. We + // might wait here indefinitely as the background compaction may never + // finish successfully, resulting in the stall condition lasting + // indefinitely + while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) { + if (write_options.no_slowdown) { + return Status::Incomplete("Write stall"); + } + delayed = true; + + // Notify write_thread_ about the stall so it can set up a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); + bg_cv_.Wait(); + write_thread_.EndWriteStall(); + } + } + assert(!delayed || !write_options.no_slowdown); + if (delayed) { + default_cf_internal_stats_->AddDBStats( + InternalStats::kIntStatsWriteStallMicros, time_delayed); + RecordTick(stats_, STALL_MICROS, time_delayed); + } + + // If DB is not in read-only mode and write_controller is not stopping + // writes, we can ignore any background errors and allow the write to + // proceed + Status s; + if (write_controller_.IsStopped()) { + // If writes are still stopped, it means we bailed due to a background + // error + s = Status::Incomplete(error_handler_.GetBGError().ToString()); + } + if (error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); + } + return s; +} + +Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, + WriteBatch* my_batch) { + assert(write_options.low_pri); + // This is called outside the DB mutex. Although it is safe to make the call, + // the consistency condition is not guaranteed to hold. It's OK to live with + // it in this case. + // If we need to speed up compaction, it means compaction is falling behind + // and we start throttling low-pri writes. + if (write_controller_.NeedSpeedupCompaction()) { + if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) { + // For 2PC, we only rate limit prepare, not commit. + return Status::OK(); + } + if (write_options.no_slowdown) { + return Status::Incomplete("Low priority write stall"); + } else { + assert(my_batch != nullptr); + // Rate limit those writes. The reason we don't block completely is + // that if the write load is heavy, low-pri writes might never get a + // chance to run. This way we guarantee they still make slow progress.
+ PERF_TIMER_GUARD(write_delay_time); + write_controller_.low_pri_rate_limiter()->Request( + my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + } + return Status::OK(); +} + +void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) { + assert(cfds != nullptr); + if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) { + for (ColumnFamilyData* cfd : *cfds) { + if (cfd == cfd_stats) { + // stats CF already included in cfds + return; + } + } + // force flush stats CF when its log number is less than all other CF's + // log numbers + bool force_flush_stats_cf = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + force_flush_stats_cf = false; + } + } + if (force_flush_stats_cf) { + cfds->push_back(cfd_stats); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with automated flush " + "to avoid holding old logs"); + } + } + } +} + +Status DBImpl::TrimMemtableHistory(WriteContext* context) { + autovector<ColumnFamilyData*> cfds; + ColumnFamilyData* tmp_cfd; + while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) != + nullptr) { + cfds.push_back(tmp_cfd); + } + for (auto& cfd : cfds) { + autovector<MemTable*> to_delete; + cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage()); + if (!to_delete.empty()) { + for (auto m : to_delete) { + delete m; + } + context->superversion_context.NewSuperVersion(); + assert(context->superversion_context.new_superversion.get() != nullptr); + cfd->InstallSuperVersion(&context->superversion_context, &mutex_); + } + + if (cfd->UnrefAndTryDelete()) { + cfd = nullptr; + } + } + return Status::OK(); +} + +Status DBImpl::ScheduleFlushes(WriteContext* context) { + autovector<ColumnFamilyData*> cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + for (auto cfd : cfds) { + cfd->Ref(); + } + flush_scheduler_.Clear(); + } else { + ColumnFamilyData* tmp_cfd; + while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { + cfds.push_back(tmp_cfd); + } + MaybeFlushStatsCF(&cfds); + } + Status status; + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + for (auto& cfd : cfds) { + if (!cfd->mem()->IsEmpty()) { + status = SwitchMemtable(cfd, context); + } + if (cfd->UnrefAndTryDelete()) { + cfd = nullptr; + } + if (!status.ok()) { + break; + } + } + + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +#ifndef ROCKSDB_LITE +void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, + const MemTableInfo& mem_table_info) { + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + + for (auto listener : immutable_db_options_.listeners) { + listener->OnMemTableSealed(mem_table_info); + } +} +#endif // ROCKSDB_LITE + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of
the writer queue +// REQUIRES: this thread is currently at the front of the 2nd writer queue if +// two_write_queues_ is true (This is to simplify the reasoning.) +Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { + mutex_.AssertHeld(); + WriteThread::Writer nonmem_w; + std::unique_ptr<WritableFile> lfile; + log::Writer* new_log = nullptr; + MemTable* new_mem = nullptr; + + // Recoverable state is persisted in WAL. After memtable switch, WAL might + // be deleted, so we write the state to memtable to be persisted as well. + Status s = WriteRecoverableState(); + if (!s.ok()) { + return s; + } + + // Attempt to switch to a new memtable and trigger flush of old. + // Do this without holding the dbmutex lock. + assert(versions_->prev_log_number() == 0); + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + bool creating_new_log = !log_empty_; + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + uint64_t recycle_log_number = 0; + if (creating_new_log && immutable_db_options_.recycle_log_file_num && + !log_recycle_files_.empty()) { + recycle_log_number = log_recycle_files_.front(); + } + uint64_t new_log_number = + creating_new_log ? versions_->NewFileNumber() : logfile_number_; + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + + // Set memtable_info for memtable sealed callback +#ifndef ROCKSDB_LITE + MemTableInfo memtable_info; + memtable_info.cf_name = cfd->GetName(); + memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); + memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); + memtable_info.num_entries = cfd->mem()->num_entries(); + memtable_info.num_deletes = cfd->mem()->num_deletes(); +#endif // ROCKSDB_LITE + // Log this later after lock release. It may be outdated, e.g., if background + // flush happens before logging, but that should be ok. + int num_imm_unflushed = cfd->imm()->NumNotFlushed(); + const auto preallocate_block_size = + GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); + mutex_.Unlock(); + if (creating_new_log) { + // TODO: Write buffer size passed in should be max of all CF's instead + // of mutable_cf_options.write_buffer_size. + s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, + &new_log); + } + if (s.ok()) { + SequenceNumber seq = versions_->LastSequence(); + new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); + context->superversion_context.NewSuperVersion(); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] New memtable created with log file: #%" PRIu64 + ". Immutable memtables: %d.\n", + cfd->GetName().c_str(), new_log_number, num_imm_unflushed); + mutex_.Lock(); + if (recycle_log_number != 0) { + // Since renaming the file is done outside DB mutex, we need to ensure + // concurrent full purges don't delete the file while we're recycling it. + // To achieve that we hold the old log number in the recyclable list until + // after it has been renamed.
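+  // (WAL recycling is opt-in via DBOptions::recycle_log_file_num, e.g. +  //   db_opts.recycle_log_file_num = 4;  // keep up to 4 old WALs for reuse +  // an illustrative value; reusing preallocated files can make WAL appends +  // cheaper on some file systems.)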
+ assert(log_recycle_files_.front() == recycle_log_number); + log_recycle_files_.pop_front(); + } + if (s.ok() && creating_new_log) { + log_write_mutex_.Lock(); + assert(new_log != nullptr); + if (!logs_.empty()) { + // Always flush the buffer of the last log before switching to a new one + log::Writer* cur_log_writer = logs_.back().writer; + s = cur_log_writer->WriteBuffer(); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64 + " WAL file\n", + cfd->GetName().c_str(), cur_log_writer->get_log_number(), + new_log_number); + } + } + if (s.ok()) { + logfile_number_ = new_log_number; + log_empty_ = true; + log_dir_synced_ = false; + logs_.emplace_back(logfile_number_, new_log); + alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); + } + log_write_mutex_.Unlock(); + } + + if (!s.ok()) { + // How do we fail if we're not creating a new log? + assert(creating_new_log); + if (new_mem) { + delete new_mem; + } + if (new_log) { + delete new_log; + } + SuperVersion* new_superversion = + context->superversion_context.new_superversion.release(); + if (new_superversion != nullptr) { + delete new_superversion; + } + // We may have lost data from the WritableFileBuffer in-memory buffer for + // the current log, so treat it as a fatal error and set bg_error + error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); + // Read back bg_error in order to get the right severity + s = error_handler_.GetBGError(); + return s; + } + + for (auto loop_cfd : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. no need to persist this in the manifest + if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && + loop_cfd->imm()->NumNotFlushed() == 0) { + if (creating_new_log) { + loop_cfd->SetLogNumber(logfile_number_); + } + loop_cfd->mem()->SetCreationSeq(versions_->LastSequence()); + } + } + + cfd->mem()->SetNextLogNumber(logfile_number_); + cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, + mutable_cf_options); +#ifndef ROCKSDB_LITE + mutex_.Unlock(); + // Notify client that memtable is sealed, now that we have successfully + // installed a new memtable + NotifyOnMemTableSealed(cfd, memtable_info); + mutex_.Lock(); +#endif // ROCKSDB_LITE + return s; +} + +size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { + mutex_.AssertHeld(); + size_t bsize = + static_cast<size_t>(write_buffer_size / 10 + write_buffer_size); + // Some users might set very high write_buffer_size and rely on + // max_total_wal_size or other parameters to control the WAL size.
+  if (mutable_db_options_.max_total_wal_size > 0) {
+    bsize = std::min(
+        bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+  }
+  if (immutable_db_options_.db_write_buffer_size > 0) {
+    bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size);
+  }
+  if (immutable_db_options_.write_buffer_manager &&
+      immutable_db_options_.write_buffer_manager->enabled()) {
+    bsize = std::min(
+        bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+  }
+
+  return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) {
+  if (nullptr == opt.timestamp) {
+    // Pre-allocate size of write batch conservatively.
+    // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+    // and we allocate 11 extra bytes for key length, as well as value length.
+    WriteBatch batch(key.size() + value.size() + 24);
+    Status s = batch.Put(column_family, key, value);
+    if (!s.ok()) {
+      return s;
+    }
+    return Write(opt, &batch);
+  }
+  const Slice* ts = opt.timestamp;
+  assert(nullptr != ts);
+  size_t ts_sz = ts->size();
+  WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0,
+                   ts_sz);
+  Status s = batch.Put(column_family, key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  s = batch.AssignTimestamp(*ts);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                  const Slice& key) {
+  WriteBatch batch;
+  batch.Delete(column_family, key);
+  return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+                        ColumnFamilyHandle* column_family, const Slice& key) {
+  WriteBatch batch;
+  batch.SingleDelete(column_family, key);
+  return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+                       ColumnFamilyHandle* column_family,
+                       const Slice& begin_key, const Slice& end_key) {
+  WriteBatch batch;
+  batch.DeleteRange(column_family, begin_key, end_key);
+  return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                 const Slice& key, const Slice& value) {
+  WriteBatch batch;
+  Status s = batch.Merge(column_family, key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_secondary_test.cc b/src/rocksdb/db/db_impl/db_secondary_test.cc
new file mode 100644
index 000000000..0b34181de
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_secondary_test.cc
@@ -0,0 +1,869 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
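+
+// The tests below exercise RocksDB's secondary-instance mode. A minimal usage
+// sketch of the public API they rely on (illustrative only; the paths are
+// placeholders and error handling is omitted):
+//
+//   Options opts;
+//   opts.max_open_files = -1;  // secondary mode requires tracking all files
+//   DB* secondary = nullptr;
+//   Status s = DB::OpenAsSecondary(opts, "/path/to/primary",
+//                                  "/path/to/secondary", &secondary);
+//   // ... the primary instance writes and flushes ...
+//   s = secondary->TryCatchUpWithPrimary();  // replay new MANIFEST/WAL state
+//   delete secondary;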
+ +#include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class DBSecondaryTest : public DBTestBase { + public: + DBSecondaryTest() + : DBTestBase("/db_secondary_test"), + secondary_path_(), + handles_secondary_(), + db_secondary_(nullptr) { + secondary_path_ = + test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); + } + + ~DBSecondaryTest() override { + CloseSecondary(); + if (getenv("KEEP_DB") != nullptr) { + fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(secondary_path_, options)); + } + } + + protected: + Status ReopenAsSecondary(const Options& options) { + return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); + } + + void OpenSecondary(const Options& options); + + void OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options); + + void CloseSecondary() { + for (auto h : handles_secondary_) { + db_secondary_->DestroyColumnFamilyHandle(h); + } + handles_secondary_.clear(); + delete db_secondary_; + db_secondary_ = nullptr; + } + + DBImplSecondary* db_secondary_full() { + return static_cast(db_secondary_); + } + + void CheckFileTypeCounts(const std::string& dir, int expected_log, + int expected_sst, int expected_manifest) const; + + std::string secondary_path_; + std::vector handles_secondary_; + DB* db_secondary_; +}; + +void DBSecondaryTest::OpenSecondary(const Options& options) { + Status s = + DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options) { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + for (const auto& cf_name : column_families) { + cf_descs.emplace_back(cf_name, options); + } + Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, + int expected_log, int expected_sst, + int expected_manifest) const { + std::vector filenames; + env_->GetChildren(dir, &filenames); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kLogFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(expected_log, log_cnt); + ASSERT_EQ(expected_sst, sst_cnt); + ASSERT_EQ(expected_manifest, manifest_cnt); +} + +TEST_F(DBSecondaryTest, ReopenAsSecondary) { + Options options; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Put("bar", "bar_value")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + Close(); + + ASSERT_OK(ReopenAsSecondary(options)); + ASSERT_EQ("foo_value", Get("foo")); + ASSERT_EQ("bar_value", Get("bar")); + ReadOptions ropts; + ropts.verify_checksums = true; + auto db1 = static_cast(db_); + ASSERT_NE(nullptr, db1); + Iterator* iter = db1->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + if (0 == count) { + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value", iter->value().ToString()); + } 
+      ASSERT_EQ("foo", iter->key().ToString());
+      ASSERT_EQ("foo_value", iter->value().ToString());
+    }
+    ++count;
+  }
+  delete iter;
+  ASSERT_EQ(2, count);
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  const auto verify_db_func = [&](const std::string& foo_val,
+                                  const std::string& bar_val) {
+    std::string value;
+    ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+    ASSERT_EQ(foo_val, value);
+    ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+    ASSERT_EQ(bar_val, value);
+    Iterator* iter = db_secondary_->NewIterator(ropts);
+    ASSERT_NE(nullptr, iter);
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ(foo_val, iter->value().ToString());
+    iter->Seek("bar");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bar", iter->key().ToString());
+    ASSERT_EQ(bar_val, iter->value().ToString());
+    size_t count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ++count;
+    }
+    ASSERT_EQ(2, count);
+    delete iter;
+  };
+
+  verify_db_func("foo_value2", "bar_value2");
+
+  ASSERT_OK(Put("foo", "new_foo_value"));
+  ASSERT_OK(Put("bar", "new_bar_value"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db_func("new_foo_value", "new_bar_value");
+}
+
+namespace {
+class TraceFileEnv : public EnvWrapper {
+ public:
+  explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {}
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& env_options) override {
+    class TracedRandomAccessFile : public RandomAccessFile {
+     public:
+      TracedRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+                             std::atomic<int>& counter)
+          : target_(std::move(target)), files_closed_(counter) {}
+      ~TracedRandomAccessFile() override {
+        files_closed_.fetch_add(1, std::memory_order_relaxed);
+      }
+      Status Read(uint64_t offset, size_t n, Slice* result,
+                  char* scratch) const override {
+        return target_->Read(offset, n, result, scratch);
+      }
+
+     private:
+      std::unique_ptr<RandomAccessFile> target_;
+      std::atomic<int>& files_closed_;
+    };
+    Status s = target()->NewRandomAccessFile(f, r, env_options);
+    if (s.ok()) {
+      r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_));
+    }
+    return s;
+  }
+
+  int files_closed() const {
+    return files_closed_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<int> files_closed_{0};
+};
+}  // namespace
+
+TEST_F(DBSecondaryTest, SecondaryCloseFiles) {
+  Options options;
+  options.env = env_;
+  options.max_open_files = 1;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  Options options1;
+  std::unique_ptr<Env> traced_env(new TraceFileEnv(env_));
+  options1.env = traced_env.get();
+  OpenSecondary(options1);
+
+  static const auto verify_db = [&]() {
+    std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+    std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(ReadOptions()));
+    for (iter1->SeekToFirst(), iter2->SeekToFirst();
+         iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) {
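+      // Lock-step scan: primary and secondary must yield identical pairs.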
ASSERT_EQ(iter1->key(), iter2->key()); + ASSERT_EQ(iter1->value(), iter2->value()); + } + ASSERT_FALSE(iter1->Valid()); + ASSERT_FALSE(iter2->Valid()); + }; + + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Put("c", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Put("d", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(2, static_cast(traced_env.get())->files_closed()); + + Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}}); + ASSERT_TRUE(s.IsNotSupported()); + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + } + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); +} + +TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options1); + cf_descs.emplace_back("pikachu", options1); + cf_descs.emplace_back("eevee", options1); + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_NOK(s); +} + +TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_EQ(0, handles_secondary_.size()); + ASSERT_NE(nullptr, db_secondary_); + + ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Flush(0 /*cf*/)); + ASSERT_OK(Flush(1 /*cf*/)); + 
ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value", value); +} + +TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { + Options options; + options.env = env_; + Reopen(options); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", + "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, + {"VersionSet::ProcessManifestWrites:AfterNewManifest", + "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" + "1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // Make sure db calls RecoverLogFiles so as to trigger a manifest write, + // which causes the db to switch to a new MANIFEST upon start. + port::Thread ro_db_thread([&]() { + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + CloseSecondary(); + }); + Reopen(options); + ro_db_thread.join(); +} + +TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, MissingTableFile) { + int table_files_not_exist = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", + [&](void* arg) { + Status s = *reinterpret_cast(arg); + if (s.IsPathNotFound()) { + ++table_files_not_exist; + } else if (!s.ok()) { + assert(false); // Should not reach here + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + 
OpenSecondary(options1); + + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_NE(nullptr, db_secondary_full()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { + Options options; + options.env = env_; + const std::string kCfName1 = "pikachu"; + CreateAndReopenWithCF({kCfName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCfName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); + ASSERT_OK(Flush(1 /*cf*/)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); + + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + Close(); + CheckFileTypeCounts(dbname_, 1, 0, 1); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + value.clear(); + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); +} + +TEST_F(DBSecondaryTest, SwitchManifest) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const int kNumFiles = options.level0_file_num_compaction_trigger - 1; + // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1, + // ..., 9. 
+ const int kNumKeys = 10; + // Create two sst + for (int i = 0; i != kNumFiles; ++i) { + for (int j = 0; j != kNumKeys; ++j) { + ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + const auto& range_scan_db = [&]() { + ReadOptions tmp_ropts; + tmp_ropts.total_order_seek = true; + tmp_ropts.verify_checksums = true; + std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); + int cnt = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { + ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); + ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), + iter->value().ToString()); + } + }; + + range_scan_db(); + + // While secondary instance still keeps old MANIFEST open, we close primary, + // restart primary, performs full compaction, close again, restart again so + // that next time secondary tries to catch up with primary, the secondary + // will skip the MANIFEST in middle. + Reopen(options); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + range_scan_db(); +} + +// Here, "Snapshot" refers to the version edits written by +// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after +// switching from the old one. +TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); +} + +TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for (int k = 0; k != 16; ++k) { + ASSERT_OK(Put("key" + 
std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { + const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + ASSERT_OK( + Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK( + Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + TEST_SYNC_POINT( + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + wb.Put("key0", "value0"); + wb.Put("key1", "value1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); + 
iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + wb1.Put("key0", "value01"); + wb1.Put("key1", "value11"); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + wb2.Put("key0", "new_value0"); + wb2.Delete("key1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. + iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); +} + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} +#endif //! ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_info_dumper.cc b/src/rocksdb/db/db_info_dumper.cc new file mode 100644 index 000000000..7008ca6ff --- /dev/null +++ b/src/rocksdb/db/db_info_dumper.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/db_info_dumper.h" + +#include +#include +#include +#include +#include + +#include "file/filename.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +void DumpDBFileSummary(const ImmutableDBOptions& options, + const std::string& dbname) { + if (options.info_log == nullptr) { + return; + } + + auto* env = options.env; + uint64_t number = 0; + FileType type = kInfoLogFile; + + std::vector files; + uint64_t file_num = 0; + uint64_t file_size; + std::string file_info, wal_info; + + Header(options.info_log, "DB SUMMARY\n"); + // Get files in dbname dir + if (!env->GetChildren(dbname, &files).ok()) { + Error(options.info_log, + "Error when reading %s dir\n", dbname.c_str()); + } + std::sort(files.begin(), files.end()); + for (const std::string& file : files) { + if (!ParseFileName(file, &number, &type)) { + continue; + } + switch (type) { + case kCurrentFile: + Header(options.info_log, "CURRENT file: %s\n", file.c_str()); + break; + case kIdentityFile: + Header(options.info_log, "IDENTITY file: %s\n", file.c_str()); + break; + case kDescriptorFile: + env->GetFileSize(dbname + "/" + file, &file_size); + Header(options.info_log, "MANIFEST file: %s size: %" PRIu64 " Bytes\n", + file.c_str(), file_size); + break; + case kLogFile: + env->GetFileSize(dbname + "/" + file, &file_size); + char str[16]; + snprintf(str, sizeof(str), "%" PRIu64, file_size); + wal_info.append(file).append(" size: "). + append(str).append(" ; "); + break; + case kTableFile: + if (++file_num < 10) { + file_info.append(file).append(" "); + } + break; + default: + break; + } + } + + // Get sst files in db_path dir + for (auto& db_path : options.db_paths) { + if (dbname.compare(db_path.path) != 0) { + if (!env->GetChildren(db_path.path, &files).ok()) { + Error(options.info_log, + "Error when reading %s dir\n", + db_path.path.c_str()); + continue; + } + std::sort(files.begin(), files.end()); + for (const std::string& file : files) { + if (ParseFileName(file, &number, &type)) { + if (type == kTableFile && ++file_num < 10) { + file_info.append(file).append(" "); + } + } + } + } + Header(options.info_log, + "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n", + db_path.path.c_str(), file_num, file_info.c_str()); + file_num = 0; + file_info.clear(); + } + + // Get wal file in wal_dir + if (dbname.compare(options.wal_dir) != 0) { + if (!env->GetChildren(options.wal_dir, &files).ok()) { + Error(options.info_log, + "Error when reading %s dir\n", + options.wal_dir.c_str()); + return; + } + wal_info.clear(); + for (const std::string& file : files) { + if (ParseFileName(file, &number, &type)) { + if (type == kLogFile) { + env->GetFileSize(options.wal_dir + "/" + file, &file_size); + char str[16]; + snprintf(str, sizeof(str), "%" PRIu64, file_size); + wal_info.append(file).append(" size: "). + append(str).append(" ; "); + } + } + } + } + Header(options.info_log, "Write Ahead Log file in %s: %s\n", + options.wal_dir.c_str(), wal_info.c_str()); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_info_dumper.h b/src/rocksdb/db/db_info_dumper.h new file mode 100644 index 000000000..91404cbd7 --- /dev/null +++ b/src/rocksdb/db/db_info_dumper.h @@ -0,0 +1,14 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include + +#include "options/db_options.h" + +namespace ROCKSDB_NAMESPACE { +void DumpDBFileSummary(const ImmutableDBOptions& options, + const std::string& dbname); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc new file mode 100644 index 000000000..26405864e --- /dev/null +++ b/src/rocksdb/db/db_inplace_update_test.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBTestInPlaceUpdate : public DBTestBase { + public: + DBTestInPlaceUpdate() : DBTestBase("/db_inplace_update_test") {} +}; + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size + int numValues = 10; + for (int i = numValues; i > 0; i--) { + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); + } + + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerSize; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size + int numValues = 10; + ASSERT_OK(Put(1, "key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key")); + } + + // Only 1 instance for that key. 
+ validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller varint size + int numValues = 265; + ASSERT_OK(Put(1, "key", DummyString(numValues, 'a'))); + ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key")); + + for (int i = numValues; i > 0; i--) { + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(1, 'b'), Get(1, "key")); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceLargerSize; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of larger size + int numValues = 10; + for (int i = 0; i < numValues; i++) { + ASSERT_OK(Put(1, "key", DummyString(i, 'a'))); + ASSERT_EQ(DummyString(i, 'c'), Get(1, "key")); + } + + // No inplace updates. All updates are puts with new seq number + // All 10 updates exist in the internal iterator + validateNumberOfEntries(numValues, 1); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + + options.env = env_; + options.write_buffer_size = 100000; + options.inplace_callback = + ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceNoAction; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Callback function requests no actions from db + ASSERT_OK(Put(1, "key", DummyString(1, 'a'))); + ASSERT_EQ(Get(1, "key"), "NOT_FOUND"); + } while (ChangeCompactOptions()); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_io_failure_test.cc b/src/rocksdb/db/db_io_failure_test.cc new file mode 100644 index 000000000..f8d562447 --- /dev/null +++ b/src/rocksdb/db/db_io_failure_test.cc @@ -0,0 +1,568 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
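+
+// These tests drive I/O failures through SpecialEnv (see db/db_test_util.h),
+// which exposes atomic fault switches. The common pattern, sketched here
+// (illustrative only):
+//
+//   env_->drop_writes_.store(true, std::memory_order_release);   // inject
+//   // ... trigger a flush or compaction and assert it surfaces an error ...
+//   std::string cnt;
+//   db_->GetProperty("rocksdb.background-errors", &cnt);  // error counter
+//   env_->drop_writes_.store(false, std::memory_order_release);  // heal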
+ +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBIOFailureTest : public DBTestBase { + public: + DBIOFailureTest() : DBTestBase("/db_io_failure_test") {} +}; + +#ifndef ROCKSDB_LITE +// Check that number of files does not grow when writes are dropped +TEST_F(DBIOFailureTest, DropWrites) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.paranoid_checks = false; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_EQ("v1", Get("foo")); + Compact("a", "z"); + const size_t num_files = CountFiles(); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); + env_->sleep_counter_.Reset(); + env_->no_slowdown_ = true; + for (int i = 0; i < 5; i++) { + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + for (int level = 0; level < dbfull()->NumberLevels(); level++) { + if (level > 0 && level == dbfull()->NumberLevels() - 1) { + break; + } + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + } + } else { + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + } + + std::string property_value; + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("5", property_value); + + env_->drop_writes_.store(false, std::memory_order_release); + ASSERT_LT(CountFiles(), num_files + 3); + + // Check that compaction attempts slept after errors + // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler + // versions + ASSERT_GE(env_->sleep_counter_.Read(), 4); + } while (ChangeCompactOptions()); +} + +// Check background error counter bumped on flush failures. +TEST_F(DBIOFailureTest, DropWritesFlush) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.max_background_flushes = 1; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); + + std::string property_value; + // Background error count is 0 now. 
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("0", property_value); + + dbfull()->TEST_FlushMemTable(true); + + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("1", property_value); + + env_->drop_writes_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); +} + +// Check that CompactRange() returns failure if there is not enough space left +// on device +TEST_F(DBIOFailureTest, NoSpaceCompactRange) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + // generate 5 tables + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i), Key(i) + "v")); + ASSERT_OK(Flush()); + } + + // Force out-of-space errors + env_->no_space_.store(true, std::memory_order_release); + + Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.IsIOError()); + ASSERT_TRUE(s.IsNoSpace()); + + env_->no_space_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); +} +#endif // ROCKSDB_LITE + +TEST_F(DBIOFailureTest, NonWritableFileSystem) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 4096; + options.arena_block_size = 4096; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "v1")); + env_->non_writeable_rate_.store(100); + std::string big(100000, 'x'); + int errors = 0; + for (int i = 0; i < 20; i++) { + if (!Put("foo", big).ok()) { + errors++; + env_->SleepForMicroseconds(100000); + } + } + ASSERT_GT(errors, 0); + env_->non_writeable_rate_.store(0); + } while (ChangeCompactOptions()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBIOFailureTest, ManifestWriteError) { + // Test for the following problem: + // (a) Compaction produces file F + // (b) Log record containing F is written to MANIFEST file, but Sync() fails + // (c) GC deletes F + // (d) After reopening DB, reads fail since deleted F is named in log record + + // We iterate twice. In the second iteration, everything is the + // same except the log record never makes it to the MANIFEST file. + for (int iter = 0; iter < 2; iter++) { + std::atomic* error_type = (iter == 0) ? 
&env_->manifest_sync_error_ + : &env_->manifest_write_error_; + + // Insert foo=>bar mapping + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ("bar", Get("foo")); + + // Memtable compaction (will succeed) + Flush(); + ASSERT_EQ("bar", Get("foo")); + const int last = 2; + MoveFilesToLevel(2); + ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level + + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); + + error_type->store(false, std::memory_order_release); + + // Since paranoid_checks=true, writes should fail + ASSERT_NOK(Put("foo2", "bar2")); + + // Recovery: should not lose data + ASSERT_EQ("bar", Get("foo")); + + // Try again with paranoid_checks=false + Close(); + options.paranoid_checks = false; + Reopen(options); + + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); + + // Recovery: should not lose data + error_type->store(false, std::memory_order_release); + Reopen(options); + ASSERT_EQ("bar", Get("foo")); + + // Since paranoid_checks=false, writes should succeed + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ("bar2", Get("foo2")); + } +} + +TEST_F(DBIOFailureTest, PutFailsParanoid) { + // Test the following: + // (a) A random put fails in paranoid mode (simulate by sync fail) + // (b) All other puts have to fail, even if writes would succeed + // (c) All of that should happen ONLY if paranoid_checks = true + + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Status s; + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + s = Put(1, "foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.store(false, std::memory_order_release); + s = Put(1, "foo3", "bar3"); + // the next put should fail, too + ASSERT_TRUE(!s.ok()); + // but we're still able to read + ASSERT_EQ("bar", Get(1, "foo")); + + // do the same thing with paranoid checks off + options.paranoid_checks = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + // simulate error + env_->log_write_error_.store(true, std::memory_order_release); + s = Put(1, "foo2", "bar2"); + ASSERT_TRUE(!s.ok()); + env_->log_write_error_.store(false, std::memory_order_release); + s = Put(1, "foo3", "bar3"); + // the next put should NOT fail + ASSERT_TRUE(s.ok()); +} +#if !(defined NDEBUG) || !defined(OS_WIN) +TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.write_buffer_size = 256 * 1024 * 1024; + options.writable_file_max_buffer_size = 128 * 1024; + options.bytes_per_sync = 128 * 1024; + options.level0_file_num_compaction_trigger = 4; + 
+  options.memtable_factory.reset(new SpecialSkipListFactory(10));
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Status s;
+
+  std::atomic<int> range_sync_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+        if (range_sync_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError("range sync dummy error");
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  std::string rnd_str =
+      RandomString(&rnd, static_cast<int>(options.bytes_per_sync / 2));
+  std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  // First 1MB doesn't get range synced
+  ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo1_1", rnd_str));
+  ASSERT_OK(Put(1, "foo1_2", rnd_str));
+  ASSERT_OK(Put(1, "foo1_3", rnd_str));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Put(1, "foo3_1", rnd_str));
+  ASSERT_OK(Put(1, "foo3_2", rnd_str));
+  ASSERT_OK(Put(1, "foo3_3", rnd_str));
+  ASSERT_OK(Put(1, "foo4", "bar"));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+  // Following writes should fail as flush failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_GE(1, range_sync_called.load());
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, CompactSstRangeSyncError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 256 * 1024 * 1024;
+  options.writable_file_max_buffer_size = 128 * 1024;
+  options.bytes_per_sync = 128 * 1024;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 256 * 1024 * 1024;
+  options.disable_auto_compactions = true;
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Status s;
+
+  Random rnd(301);
+  std::string rnd_str =
+      RandomString(&rnd, static_cast<int>(options.bytes_per_sync / 2));
+  std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  // First 1MB doesn't get range synced
+  ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo1_1", rnd_str));
+  ASSERT_OK(Put(1, "foo1_2", rnd_str));
+  ASSERT_OK(Put(1, "foo1_3", rnd_str));
+  Flush(1);
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo3_1", rnd_str));
+  ASSERT_OK(Put(1, "foo3_2", rnd_str));
+  ASSERT_OK(Put(1, "foo3_3", rnd_str));
+  ASSERT_OK(Put(1, "foo4", "bar"));
+  Flush(1);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+  std::atomic<int> range_sync_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+        if (range_sync_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError("range sync dummy error");
+        }
+      });
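+  // Inject an IOError into the first RangeSync issued by the compaction
+  // triggered below.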
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->SetOptions(handles_[1], + { + {"disable_auto_compactions", "false"}, + })); + dbfull()->TEST_WaitForCompact(); + + // Following writes should fail as flush failed. + ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar", Get(1, "foo")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_GE(1, range_sync_called.load()); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar", Get(1, "foo")); +} + +TEST_F(DBIOFailureTest, FlushSstCloseError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.level0_file_num_compaction_trigger = 4; + options.memtable_factory.reset(new SpecialSkipListFactory(2)); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Status s; + std::atomic close_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Close", [&](void* arg) { + if (close_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError("close dummy error"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + ASSERT_OK(Put(1, "foo", "bar2")); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + + // Following writes should fail as flush failed. + ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); +} + +TEST_F(DBIOFailureTest, CompactionSstCloseError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Status s; + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo2", "bar")); + Flush(1); + ASSERT_OK(Put(1, "foo", "bar2")); + ASSERT_OK(Put(1, "foo2", "bar")); + Flush(1); + ASSERT_OK(Put(1, "foo", "bar3")); + ASSERT_OK(Put(1, "foo2", "bar")); + Flush(1); + dbfull()->TEST_WaitForCompact(); + + std::atomic close_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Close", [&](void* arg) { + if (close_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError("close dummy error"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->SetOptions(handles_[1], + { + {"disable_auto_compactions", "false"}, + })); + dbfull()->TEST_WaitForCompact(); + + // Following writes should fail as compaction failed. 
+ ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar3", Get(1, "foo")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar3", Get(1, "foo")); +} + +TEST_F(DBIOFailureTest, FlushSstSyncError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.use_fsync = false; + options.level0_file_num_compaction_trigger = 4; + options.memtable_factory.reset(new SpecialSkipListFactory(2)); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Status s; + std::atomic sync_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Sync", [&](void* arg) { + if (sync_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError("sync dummy error"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo1", "bar1")); + ASSERT_OK(Put(1, "foo", "bar2")); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + + // Following writes should fail as flush failed. + ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar2", Get(1, "foo")); + ASSERT_EQ("bar1", Get(1, "foo1")); +} + +TEST_F(DBIOFailureTest, CompactionSstSyncError) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.error_if_exists = false; + options.paranoid_checks = true; + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = true; + options.use_fsync = false; + + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Status s; + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "foo2", "bar")); + Flush(1); + ASSERT_OK(Put(1, "foo", "bar2")); + ASSERT_OK(Put(1, "foo2", "bar")); + Flush(1); + ASSERT_OK(Put(1, "foo", "bar3")); + ASSERT_OK(Put(1, "foo2", "bar")); + Flush(1); + dbfull()->TEST_WaitForCompact(); + + std::atomic sync_called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SpecialEnv::SStableFile::Sync", [&](void* arg) { + if (sync_called.fetch_add(1) == 0) { + Status* st = static_cast(arg); + *st = Status::IOError("close dummy error"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->SetOptions(handles_[1], + { + {"disable_auto_compactions", "false"}, + })); + dbfull()->TEST_WaitForCompact(); + + // Following writes should fail as compaction failed. 
+ ASSERT_NOK(Put(1, "foo2", "bar3")); + ASSERT_EQ("bar3", Get(1, "foo")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("bar3", Get(1, "foo")); +} +#endif // !(defined NDEBUG) || !defined(OS_WIN) +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc new file mode 100644 index 000000000..e5d402948 --- /dev/null +++ b/src/rocksdb/db/db_iter.cc @@ -0,0 +1,1310 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_iter.h" +#include <string> +#include <iostream> +#include <limits> + +#include "db/dbformat.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "memory/arena.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" +#include "table/internal_iterator.h" +#include "table/iterator_wrapper.h" +#include "trace_replay/trace_replay.h" +#include "util/mutexlock.h" +#include "util/string_util.h" +#include "util/user_comparator_wrapper.h" + +namespace ROCKSDB_NAMESPACE { + +DBIter::DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* cmp, InternalIterator* iter, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) + : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), + env_(_env), + logger_(cf_options.info_log), + user_comparator_(cmp), + merge_operator_(cf_options.merge_operator), + iter_(iter), + read_callback_(read_callback), + sequence_(s), + statistics_(cf_options.statistics), + num_internal_keys_skipped_(0), + iterate_lower_bound_(read_options.iterate_lower_bound), + iterate_upper_bound_(read_options.iterate_upper_bound), + direction_(kForward), + valid_(false), + current_entry_is_merged_(false), + is_key_seqnum_zero_(false), + prefix_same_as_start_(mutable_cf_options.prefix_extractor + ?
read_options.prefix_same_as_start + : false), + pin_thru_lifetime_(read_options.pin_data), + expect_total_order_inner_iter_(prefix_extractor_ == nullptr || + read_options.total_order_seek || + read_options.auto_prefix_mode), + allow_blob_(allow_blob), + is_blob_(false), + arena_mode_(arena_mode), + range_del_agg_(&cf_options.internal_comparator, s), + db_impl_(db_impl), + cfd_(cfd), + start_seqnum_(read_options.iter_start_seqnum) { + RecordTick(statistics_, NO_ITERATOR_CREATED); + max_skip_ = max_sequential_skip_in_iterations; + max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; + if (pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + if (iter_.iter()) { + iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); + } +} + +Status DBIter::GetProperty(std::string prop_name, std::string* prop) { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); + } + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. + return iter_.iter()->GetProperty(prop_name, prop); + } else if (prop_name == "rocksdb.iterator.is-key-pinned") { + if (valid_) { + *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0"; + } else { + *prop = "Iterator is not valid."; + } + return Status::OK(); + } else if (prop_name == "rocksdb.iterator.internal-key") { + *prop = saved_key_.GetUserKey().ToString(); + return Status::OK(); + } + return Status::InvalidArgument("Unidentified property."); +} + +bool DBIter::ParseKey(ParsedInternalKey* ikey) { + if (!ParseInternalKey(iter_.key(), ikey)) { + status_ = Status::Corruption("corrupted internal key in DBIter"); + valid_ = false; + ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s", + iter_.key().ToString(true).c_str()); + return false; + } else { + return true; + } +} + +void DBIter::Next() { + assert(valid_); + assert(status_.ok()); + + PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_); + // Release temporarily pinned blocks from last operation + ReleaseTempPinnedData(); + local_stats_.skip_count_ += num_internal_keys_skipped_; + local_stats_.skip_count_--; + num_internal_keys_skipped_ = 0; + bool ok = true; + if (direction_ == kReverse) { + is_key_seqnum_zero_ = false; + if (!ReverseToForward()) { + ok = false; + } + } else if (!current_entry_is_merged_) { + // If the current value is not a merge, the iter position is the + // current key, which is already returned. We can safely issue a + // Next() without checking the current key. + // If the current key is a merge, very likely iter already points + // to the next internal position. + assert(iter_.Valid()); + iter_.Next(); + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } + + local_stats_.next_count_++; + if (ok && iter_.Valid()) { + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } + FindNextUserEntry(true /* skipping the current user key */, + prefix_same_as_start_ ? 
&prefix : nullptr); + } else { + is_key_seqnum_zero_ = false; + valid_ = false; + } + if (statistics_ != nullptr && valid_) { + local_stats_.next_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); + } +} + +// PRE: saved_key_ has the current user key if skipping_saved_key +// POST: saved_key_ should have the next user key if valid_, +// if the current entry is a result of merge +// current_entry_is_merged_ => true +// saved_value_ => the merged value +// +// NOTE: In between, saved_key_ can point to a user key that has +// a delete marker or a sequence number higher than sequence_ +// saved_key_ MUST have a proper user_key before calling this function +// +// The prefix parameter, if not null, indicates that we need to iterate +// within the prefix, and the iterator needs to be made invalid, if no +// more entry for the prefix can be found. +bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) { + PERF_TIMER_GUARD(find_next_user_entry_time); + return FindNextUserEntryInternal(skipping_saved_key, prefix); +} + +// Actual implementation of DBIter::FindNextUserEntry() +bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, + const Slice* prefix) { + // Loop until we hit an acceptable entry to yield + assert(iter_.Valid()); + assert(status_.ok()); + assert(direction_ == kForward); + current_entry_is_merged_ = false; + + // How many times in a row we have skipped an entry with user key less than + // or equal to saved_key_. We could skip these entries either because + // sequence numbers were too high or because skipping_saved_key = true. + // What saved_key_ contains throughout this method: + // - if skipping_saved_key : saved_key_ contains the key that we need + // to skip, and we haven't seen any keys greater + // than that, + // - if num_skipped > 0 : saved_key_ contains the key that we have skipped + // num_skipped times, and we haven't seen any keys + // greater than that, + // - none of the above : saved_key_ can contain anything, it doesn't + // matter. + uint64_t num_skipped = 0; + // For write unprepared, the target sequence number in reseek could be larger + // than the snapshot, and thus needs to be skipped again. This could result in + // an infinite loop of reseeks. To avoid that, we limit the number of reseeks + // to one. + bool reseek_done = false; + + is_blob_ = false; + + do { + // Will update is_key_seqnum_zero_ as soon as we parsed the current key + // but we need to save the previous value to be used in the loop. + bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; + if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; + return false; + } + + is_key_seqnum_zero_ = (ikey_.sequence == 0); + + assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); + if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { + break; + } + + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + assert(prefix_same_as_start_); + break; + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + if (IsVisible(ikey_.sequence)) { + // If the previous entry is of seqnum 0, the current entry will not + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. 
We are cautious because relaxing it would be more + // prone to bugs if the same user key appears with the same sequence number. + if (!is_prev_key_seqnum_zero && skipping_saved_key && + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= + 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + assert(!skipping_saved_key || + user_comparator_.Compare(ikey_.user_key, + saved_key_.GetUserKey()) > 0); + num_skipped = 0; + reseek_done = false; + switch (ikey_.type) { + case kTypeDeletion: + case kTypeSingleDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + // if the iterator specified start_seqnum we + // 1) return the internal key, including the type + // 2) return the ikey only if ikey.seqnum >= start_seqnum_ + // note that if the deletion seqnum is < start_seqnum_ we + // just skip it like in a normal iterator. + if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_) { + saved_key_.SetInternalKey(ikey_); + valid_ = true; + return true; + } else { + saved_key_.SetUserKey( + ikey_.user_key, !pin_thru_lifetime_ || + !iter_.iter()->IsKeyPinned() /* copy */); + skipping_saved_key = true; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } + break; + case kTypeValue: + case kTypeBlobIndex: + if (start_seqnum_ > 0) { + // we are taking an incremental snapshot here + // incremental snapshots aren't supported on a DB with range deletes + assert(ikey_.type != kTypeBlobIndex); + if (ikey_.sequence >= start_seqnum_) { + saved_key_.SetInternalKey(ikey_); + valid_ = true; + return true; + } else { + // this key and all its previous versions shouldn't be included; + // skip them via skipping_saved_key + saved_key_.SetUserKey( + ikey_.user_key, + !pin_thru_lifetime_ || + !iter_.iter()->IsKeyPinned() /* copy */); + skipping_saved_key = true; + } + } else { + saved_key_.SetUserKey( + ikey_.user_key, !pin_thru_lifetime_ || + !iter_.iter()->IsKeyPinned() /* copy */); + if (range_del_agg_.ShouldDelete( + ikey_, RangeDelPositioningMode::kForwardTraversal)) { + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + skipping_saved_key = true; + num_skipped = 0; + reseek_done = false; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } else if (ikey_.type == kTypeBlobIndex) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + valid_ = false; + return false; + } + + is_blob_ = true; + valid_ = true; + return true; + } else { + valid_ = true; + return true; + } + } + break; + case kTypeMerge: + saved_key_.SetUserKey( + ikey_.user_key, + !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); + if (range_del_agg_.ShouldDelete( + ikey_, RangeDelPositioningMode::kForwardTraversal)) { + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + skipping_saved_key = true; + num_skipped = 0; + reseek_done = false; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } else { + // By now, we are sure the current ikey is going to yield a + // value + current_entry_is_merged_ = true; + valid_ = true; + return MergeValuesNewToOld(); // Go to a different state machine + } + break; + default: + assert(false); + break; + } + } + } else { + PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + + // This key was inserted after our snapshot was taken.
+ // If this happens too many times in a row for the same user key, we want + // to seek to the target sequence number. + int cmp = + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()); + if (cmp == 0 || (skipping_saved_key && cmp < 0)) { + num_skipped++; + } else { + saved_key_.SetUserKey( + ikey_.user_key, + !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); + skipping_saved_key = false; + num_skipped = 0; + reseek_done = false; + } + } + + // If we have sequentially iterated via numerous equal keys, then it's + // better to seek so that we can avoid too many key comparisons. + // + // To avoid infinite loops, do not reseek if we have already attempted to + // reseek previously. + // + // TODO(lth): If we reseek to sequence number greater than ikey_.sequence, + // then it does not make sense to reseek as we would actually land further + // away from the desired key. There is opportunity for optimization here. + if (num_skipped > max_skip_ && !reseek_done) { + is_key_seqnum_zero_ = false; + num_skipped = 0; + reseek_done = true; + std::string last_key; + if (skipping_saved_key) { + // We're looking for the next user-key but all we see are the same + // user-key with decreasing sequence numbers. Fast forward to + // sequence number 0 and type deletion (the smallest type). + AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), + 0, kTypeDeletion)); + // Don't set skipping_saved_key = false because we may still see more + // user-keys equal to saved_key_. + } else { + // We saw multiple entries with this user key and sequence numbers + // higher than sequence_. Fast forward to sequence_. + // Note that this only covers a case when a higher key was overwritten + // many times since our snapshot was taken, not the case when a lot of + // different keys were inserted after our snapshot was taken. + AppendInternalKey(&last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } + iter_.Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } else { + iter_.Next(); + } + } while (iter_.Valid()); + + valid_ = false; + return iter_.status().ok(); +} + +// Merge values of the same user key starting from the current iter_ position +// Scan from the newer entries to older entries. 
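+// Illustrative example (hypothetical entries, newest first): +// ("K", seqno 12, kTypeMerge, "+2") +// ("K", seqno 10, kTypeMerge, "+1") +// ("K", seqno 7, kTypeValue, "5") +// The operands "+2" and "+1" are collected, the Put supplies the base value +// "5", and a counter-style merge operator would combine them into "8".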
+// PRE: iter_.key() points to the first merge type entry +// saved_key_ stores the user key +// POST: saved_value_ has the merged value for the user key +// iter_ points to the next entry (or invalid) +bool DBIter::MergeValuesNewToOld() { + if (!merge_operator_) { + ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null."); + status_ = Status::InvalidArgument("merge_operator_ must be set."); + valid_ = false; + return false; + } + + // Temporarily pin the blocks that hold merge operands + TempPinData(); + merge_context_.Clear(); + // Start the merge process by pushing the first operand + merge_context_.PushOperand( + iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); + + ParsedInternalKey ikey; + Status s; + for (iter_.Next(); iter_.Valid(); iter_.Next()) { + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); + if (!ParseKey(&ikey)) { + return false; + } + + if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + // hit the next user key, stop right here + break; + } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || + range_del_agg_.ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal)) { + // hit a delete with the same user key, stop right here + // iter_ is positioned after delete + iter_.Next(); + break; + } else if (kTypeValue == ikey.type) { + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + const Slice val = iter_.value(); + s = MergeHelper::TimedFullMerge( + merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, env_, &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + return true; + } else if (kTypeMerge == ikey.type) { + // hit a merge, add the value as an operand and run associative merge. + // when complete, add result to operands and continue. + merge_context_.PushOperand( + iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + } else if (kTypeBlobIndex == ikey.type) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + } else { + status_ = + Status::NotSupported("Blob DB does not support merge operator."); + } + valid_ = false; + return false; + } else { + assert(false); + } + } + + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + // we either exhausted all internal keys under this user key, or hit + // a deletion marker. + // feed null as the existing value to the merge operator, such that + // client can differentiate this scenario and do things accordingly. 
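+ // (E.g. with operands ["+1", "+2"] and a nullptr base, a counter-style + // merge operator would typically treat the base as zero and yield "3"; + // the exact behavior is operator-specific.)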
+ s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(), + nullptr, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, env_, + &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + + assert(status_.ok()); + return true; +} + +void DBIter::Prev() { + assert(valid_); + assert(status_.ok()); + + PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_); + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + bool ok = true; + if (direction_ == kForward) { + if (!ReverseToBackward()) { + ok = false; + } + } + if (ok) { + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } + PrevInternal(prefix_same_as_start_ ? &prefix : nullptr); + } + + if (statistics_ != nullptr) { + local_stats_.prev_count_++; + if (valid_) { + local_stats_.prev_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); + } + } +} + +bool DBIter::ReverseToForward() { + assert(iter_.status().ok()); + + // When moving backwards, iter_ is positioned on _previous_ key, which may + // not exist or may have different prefix than the current key(). + // If that's the case, seek iter_ to current key. + if (!expect_total_order_inner_iter() || !iter_.Valid()) { + IterKey last_key; + last_key.SetInternalKey(ParsedInternalKey( + saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + iter_.Seek(last_key.GetInternalKey()); + } + + direction_ = kForward; + // Skip keys less than the current key() (a.k.a. saved_key_). + while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) { + return true; + } + iter_.Next(); + } + + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + return true; +} + +// Move iter_ to the key before saved_key_. +bool DBIter::ReverseToBackward() { + assert(iter_.status().ok()); + + // When current_entry_is_merged_ is true, iter_ may be positioned on the next + // key, which may not exist or may have prefix different from current. + // If that's the case, seek to saved_key_. + if (current_entry_is_merged_ && + (!expect_total_order_inner_iter() || !iter_.Valid())) { + IterKey last_key; + // Using kMaxSequenceNumber and kValueTypeForSeek + // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller + // than saved_key_. + last_key.SetInternalKey(ParsedInternalKey( + saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + if (!expect_total_order_inner_iter()) { + iter_.SeekForPrev(last_key.GetInternalKey()); + } else { + // Some iterators may not support SeekForPrev(), so we avoid using it + // when prefix seek mode is disabled. This is somewhat expensive + // (an extra Prev(), as well as an extra change of direction of iter_), + // so we may need to reconsider it later. 
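+ // Seek() positions at the first entry >= last_key; if that runs off the + // end of the data, the SeekToLast() below recovers the final entry, and + // FindUserKeyBeforeSavedKey() then steps back past saved_key_.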
+ iter_.Seek(last_key.GetInternalKey()); + if (!iter_.Valid() && iter_.status().ok()) { + iter_.SeekToLast(); + } + } + } + + direction_ = kReverse; + return FindUserKeyBeforeSavedKey(); +} + +void DBIter::PrevInternal(const Slice* prefix) { + while (iter_.Valid()) { + saved_key_.SetUserKey( + ExtractUserKey(iter_.key()), + !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); + + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_->Transform(saved_key_.GetUserKey()) + .compare(*prefix) != 0) { + assert(prefix_same_as_start_); + // Current key does not have the same prefix as start + valid_ = false; + return; + } + + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) >= 0); + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) < 0) { + // We've iterated earlier than the user-specified lower bound. + valid_ = false; + return; + } + + if (!FindValueForCurrentKey()) { // assigns valid_ + return; + } + + // Whether or not we found a value for current key, we need iter_ to end up + // on a smaller key. + if (!FindUserKeyBeforeSavedKey()) { + return; + } + + if (valid_) { + // Found the value. + return; + } + + if (TooManyInternalKeysSkipped(false)) { + return; + } + } + + // We haven't found any key - iterator is not valid + valid_ = false; +} + +// Used for backwards iteration. +// Looks at the entries with user key saved_key_ and finds the most up-to-date +// value for it, or executes a merge, or determines that the value was deleted. +// Sets valid_ to true if the value is found and is ready to be presented to +// the user through value(). +// Sets valid_ to false if the value was deleted, and we should try another key. +// Returns false if an error occurred, and !status().ok() and !valid_. +// +// PRE: iter_ is positioned on the last entry with user key equal to saved_key_. +// POST: iter_ is positioned on one of the entries equal to saved_key_, or on +// the entry just before them, or on the entry just after them. +bool DBIter::FindValueForCurrentKey() { + assert(iter_.Valid()); + merge_context_.Clear(); + current_entry_is_merged_ = false; + // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or + // kTypeValue) + ValueType last_not_merge_type = kTypeDeletion; + ValueType last_key_entry_type = kTypeDeletion; + + // Temporarily pin blocks that hold (merge operands / the value) + ReleaseTempPinnedData(); + TempPinData(); + size_t num_skipped = 0; + while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + + if (!IsVisible(ikey.sequence) || + !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + break; + } + if (TooManyInternalKeysSkipped()) { + return false; + } + + // This user key has lots of entries. + // We're going from old to new, and it's taking too long. Let's do a Seek() + // and go from new to old. This helps when a key was overwritten many times. 
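+ // (E.g. a key with thousands of versions would otherwise cost one Prev() + // per version; a single Seek() to (saved_key_, sequence_) reaches the + // newest visible version directly. Numbers are illustrative.)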
+ if (num_skipped >= max_skip_) { + return FindValueForCurrentKeyUsingSeek(); + } + + last_key_entry_type = ikey.type; + switch (last_key_entry_type) { + case kTypeValue: + case kTypeBlobIndex: + if (range_del_agg_.ShouldDelete( + ikey, RangeDelPositioningMode::kBackwardTraversal)) { + last_key_entry_type = kTypeRangeDeletion; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } else { + assert(iter_.iter()->IsValuePinned()); + pinned_value_ = iter_.value(); + } + merge_context_.Clear(); + last_not_merge_type = last_key_entry_type; + break; + case kTypeDeletion: + case kTypeSingleDeletion: + merge_context_.Clear(); + last_not_merge_type = last_key_entry_type; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + break; + case kTypeMerge: + if (range_del_agg_.ShouldDelete( + ikey, RangeDelPositioningMode::kBackwardTraversal)) { + merge_context_.Clear(); + last_key_entry_type = kTypeRangeDeletion; + last_not_merge_type = last_key_entry_type; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } else { + assert(merge_operator_ != nullptr); + merge_context_.PushOperandBack( + iter_.value(), + iter_.iter()->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + } + break; + default: + assert(false); + } + + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + iter_.Prev(); + ++num_skipped; + } + + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + Status s; + is_blob_ = false; + switch (last_key_entry_type) { + case kTypeDeletion: + case kTypeSingleDeletion: + case kTypeRangeDeletion: + valid_ = false; + return true; + case kTypeMerge: + current_entry_is_merged_ = true; + if (last_not_merge_type == kTypeDeletion || + last_not_merge_type == kTypeSingleDeletion || + last_not_merge_type == kTypeRangeDeletion) { + s = MergeHelper::TimedFullMerge( + merge_operator_, saved_key_.GetUserKey(), nullptr, + merge_context_.GetOperands(), &saved_value_, logger_, statistics_, + env_, &pinned_value_, true); + } else if (last_not_merge_type == kTypeBlobIndex) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + } else { + status_ = + Status::NotSupported("Blob DB does not support merge operator."); + } + valid_ = false; + return false; + } else { + assert(last_not_merge_type == kTypeValue); + s = MergeHelper::TimedFullMerge( + merge_operator_, saved_key_.GetUserKey(), &pinned_value_, + merge_context_.GetOperands(), &saved_value_, logger_, statistics_, + env_, &pinned_value_, true); + } + break; + case kTypeValue: + // do nothing - we already have the value in pinned_value_ + break; + case kTypeBlobIndex: + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + valid_ = false; + return false; + } + is_blob_ = true; + break; + default: + assert(false); + break; + } + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + valid_ = true; + return true; +} + +// This function is used in FindValueForCurrentKey. +// We use the Seek() function instead of Prev() to find the necessary value. +// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld(). +// Would be nice to reuse some code.
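+// The seek target is the internal key (saved_key_, sequence_, +// kValueTypeForSeek), so the inner iterator lands on the newest entry for +// saved_key_ with seqno <= sequence_; visibility under read_callback is then +// re-checked in the loop below.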
+bool DBIter::FindValueForCurrentKeyUsingSeek() { + // FindValueForCurrentKey will enable pinning before calling + // FindValueForCurrentKeyUsingSeek() + assert(pinned_iters_mgr_.PinningEnabled()); + std::string last_key; + AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), + sequence_, kValueTypeForSeek)); + iter_.Seek(last_key); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + + // In case a read_callback is present, the value we seek to may not be + // visible. Find the next value that's visible. + ParsedInternalKey ikey; + is_blob_ = false; + while (true) { + if (!iter_.Valid()) { + valid_ = false; + return iter_.status().ok(); + } + + if (!ParseKey(&ikey)) { + return false; + } + if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + // No visible values for this key, even though FindValueForCurrentKey() + // has seen some. This is possible if we're using a tailing iterator, and + // the entries were discarded in a compaction. + valid_ = false; + return true; + } + + if (IsVisible(ikey.sequence)) { + break; + } + + iter_.Next(); + } + + if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || + range_del_agg_.ShouldDelete( + ikey, RangeDelPositioningMode::kBackwardTraversal)) { + valid_ = false; + return true; + } + if (ikey.type == kTypeBlobIndex && !allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + valid_ = false; + return false; + } + if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { + assert(iter_.iter()->IsValuePinned()); + pinned_value_ = iter_.value(); + is_blob_ = (ikey.type == kTypeBlobIndex); + valid_ = true; + return true; + } + + // kTypeMerge. We need to collect all kTypeMerge values and save them + // in operands + assert(ikey.type == kTypeMerge); + current_entry_is_merged_ = true; + merge_context_.Clear(); + merge_context_.PushOperand( + iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + while (true) { + iter_.Next(); + + if (!iter_.Valid()) { + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + break; + } + if (!ParseKey(&ikey)) { + return false; + } + if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + break; + } + + if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || + range_del_agg_.ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal)) { + break; + } else if (ikey.type == kTypeValue) { + const Slice val = iter_.value(); + Status s = MergeHelper::TimedFullMerge( + merge_operator_, saved_key_.GetUserKey(), &val, + merge_context_.GetOperands(), &saved_value_, logger_, statistics_, + env_, &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + valid_ = true; + return true; + } else if (ikey.type == kTypeMerge) { + merge_context_.PushOperand( + iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + } else if (ikey.type == kTypeBlobIndex) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index.
Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + } else { + status_ = + Status::NotSupported("Blob DB does not support merge operator."); + } + valid_ = false; + return false; + } else { + assert(false); + } + } + + Status s = MergeHelper::TimedFullMerge( + merge_operator_, saved_key_.GetUserKey(), nullptr, + merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, + &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + + // Make sure we leave iter_ in a good state. If it's valid and we don't care + // about prefixes, that's already good enough. Otherwise it needs to be + // seeked to the current key. + if (!expect_total_order_inner_iter() || !iter_.Valid()) { + if (!expect_total_order_inner_iter()) { + iter_.SeekForPrev(last_key); + } else { + iter_.Seek(last_key); + if (!iter_.Valid() && iter_.status().ok()) { + iter_.SeekToLast(); + } + } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + } + + valid_ = true; + return true; +} + +// Move backwards until a key smaller than saved_key_ is reached. +// Changes valid_ only if return value is false. +bool DBIter::FindUserKeyBeforeSavedKey() { + assert(status_.ok()); + size_t num_skipped = 0; + while (iter_.Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + + if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) { + return true; + } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + assert(ikey.sequence != kMaxSequenceNumber); + if (!IsVisible(ikey.sequence)) { + PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + } else { + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } + + if (num_skipped >= max_skip_) { + num_skipped = 0; + IterKey last_key; + last_key.SetInternalKey(ParsedInternalKey( + saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + // It would be more efficient to use SeekForPrev() here, but some + // iterators may not support it. + iter_.Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + if (!iter_.Valid()) { + break; + } + } else { + ++num_skipped; + } + + iter_.Prev(); + } + + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + return true; +} + +bool DBIter::TooManyInternalKeysSkipped(bool increment) { + if ((max_skippable_internal_keys_ > 0) && + (num_internal_keys_skipped_ > max_skippable_internal_keys_)) { + valid_ = false; + status_ = Status::Incomplete("Too many internal keys skipped."); + return true; + } else if (increment) { + num_internal_keys_skipped_++; + } + return false; +} + +bool DBIter::IsVisible(SequenceNumber sequence) { + if (read_callback_ == nullptr) { + return sequence <= sequence_; + } else { + return read_callback_->IsVisible(sequence); + } +} + +void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + SequenceNumber seq = sequence_; + saved_key_.Clear(); + saved_key_.SetInternalKey(target, seq); + + if (iterate_lower_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < + 0) { + // Seek key is smaller than the lower bound. + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq); + } +} + +void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + saved_key_.Clear(); + // now saved_key_ is used to store the internal key.
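+ // With seqno 0 and kValueTypeForSeekForPrev this internal key sorts after + // every real entry for `target` (larger seqnos sort first in internal key + // order), so SeekForPrev() can position at the last internal entry whose + // user key is <= target.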
+ saved_key_.SetInternalKey(target, 0 /* sequence_number */, + kValueTypeForSeekForPrev); + + if (iterate_upper_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_upper_bound_) >= 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + } +} + +void DBIter::Seek(const Slice& target) { + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + StopWatch sw(env_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeek(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + + status_ = Status::OK(); + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + + // Seek the inner iterator based on the target key. + { + PERF_TIMER_GUARD(seek_internal_seek_time); + + SetSavedKeyToSeekTarget(target); + iter_.Seek(saved_key_.GetInternalKey()); + + range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); + } + if (!iter_.Valid()) { + valid_ = false; + return; + } + direction_ = kForward; + + // Now the inner iterator is placed to the target position. From there, + // we need to find out the next key that is visible to the user. + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exhausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix = prefix_extractor_->Transform(target); + FindNextUserEntry(false /* not skipping saved_key */, + &target_prefix /* prefix */); + if (valid_) { + // Remember the prefix of the seek key for the future Next() call to + // check. + prefix_.SetUserKey(target_prefix); + } + } else { + FindNextUserEntry(false /* not skipping saved_key */, nullptr); + } + if (!valid_) { + return; + } + + // Updating stats and perf context counters. + if (statistics_ != nullptr) { + // Decrement since we don't want to count this key as skipped + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + } + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); +} + +void DBIter::SeekForPrev(const Slice& target) { + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + StopWatch sw(env_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + + status_ = Status::OK(); + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + + // Seek the inner iterator based on the target key. + { + PERF_TIMER_GUARD(seek_internal_seek_time); + SetSavedKeyToSeekForPrevTarget(target); + iter_.SeekForPrev(saved_key_.GetInternalKey()); + range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); + } + if (!iter_.Valid()) { + valid_ = false; + return; + } + direction_ = kReverse; + + // Now the inner iterator is placed to the target position. From there, + // we need to find out the first key that is visible to the user in the + // backward direction. + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exhausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix = prefix_extractor_->Transform(target); + PrevInternal(&target_prefix); + if (valid_) { + // Remember the prefix of the seek key for the future Prev() call to + // check.
+ prefix_.SetUserKey(target_prefix); + } + } else { + PrevInternal(nullptr); + } + + // Report stats and perf context. + if (statistics_ != nullptr && valid_) { + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); + } +} + +void DBIter::SeekToFirst() { + if (iterate_lower_bound_ != nullptr) { + Seek(*iterate_lower_bound_); + return; + } + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + // Don't use iter_.Seek() if we set a prefix extractor + // because prefix seek will be used. + if (!expect_total_order_inner_iter()) { + max_skip_ = std::numeric_limits<uint64_t>::max(); + } + status_ = Status::OK(); + direction_ = kForward; + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + ClearSavedValue(); + is_key_seqnum_zero_ = false; + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_.SeekToFirst(); + range_del_agg_.InvalidateRangeDelMapPositions(); + } + + RecordTick(statistics_, NUMBER_DB_SEEK); + if (iter_.Valid()) { + saved_key_.SetUserKey( + ExtractUserKey(iter_.key()), + !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); + FindNextUserEntry(false /* not skipping saved_key */, + nullptr /* no prefix check */); + if (statistics_ != nullptr) { + if (valid_) { + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); + } + } + } else { + valid_ = false; + } + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + } +} + +void DBIter::SeekToLast() { + if (iterate_upper_bound_ != nullptr) { + // Seek to last key strictly less than ReadOptions.iterate_upper_bound. + SeekForPrev(*iterate_upper_bound_); + if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { + ReleaseTempPinnedData(); + PrevInternal(nullptr); + } + return; + } + + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + // Don't use iter_.Seek() if we set a prefix extractor + // because prefix seek will be used.
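+ // Making max_skip_ effectively infinite disables the reseek optimization + // here: in prefix seek mode a reseek by internal key could jump outside + // the current prefix.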
+ if (!expect_total_order_inner_iter()) { + max_skip_ = std::numeric_limits<uint64_t>::max(); + } + status_ = Status::OK(); + direction_ = kReverse; + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + ClearSavedValue(); + is_key_seqnum_zero_ = false; + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_.SeekToLast(); + range_del_agg_.InvalidateRangeDelMapPositions(); + } + PrevInternal(nullptr); + if (statistics_ != nullptr) { + RecordTick(statistics_, NUMBER_DB_SEEK); + if (valid_) { + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); + } + } + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + } +} + +Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* user_key_comparator, + InternalIterator* internal_iter, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) { + DBIter* db_iter = new DBIter( + env, read_options, cf_options, mutable_cf_options, user_key_comparator, + internal_iter, sequence, false, max_sequential_skip_in_iterations, + read_callback, db_impl, cfd, allow_blob); + return db_iter; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h new file mode 100644 index 000000000..32704e4d5 --- /dev/null +++ b/src/rocksdb/db/db_iter.h @@ -0,0 +1,344 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> +#include <string> +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/range_del_aggregator.h" +#include "memory/arena.h" +#include "options/cf_options.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "table/iterator_wrapper.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +// This file declares the factory functions of DBIter, in its original form +// or a wrapped form with class ArenaWrappedDBIter, which is defined here. +// Class DBIter, which is declared and implemented inside db_iter.cc, is +// an iterator that converts internal keys (yielded by an InternalIterator) +// that were live at the specified sequence number into appropriate user +// keys. +// Each internal key consists of a user key, a sequence number, and a value +// type. DBIter deals with multiple key versions, tombstones, merge operands, +// etc, and exposes an Iterator.
+// For example, DBIter may wrap following InternalIterator: +// user key: AAA value: v3 seqno: 100 type: Put +// user key: AAA value: v2 seqno: 97 type: Put +// user key: AAA value: v1 seqno: 95 type: Put +// user key: BBB value: v1 seqno: 90 type: Put +// user key: BBC value: N/A seqno: 98 type: Delete +// user key: BBC value: v1 seqno: 95 type: Put +// If the snapshot passed in is 102, then the DBIter is expected to +// expose the following iterator: +// key: AAA value: v3 +// key: BBB value: v1 +// If the snapshot passed in is 96, then it should expose: +// key: AAA value: v1 +// key: BBB value: v1 +// key: BBC value: v1 +// + +// Memtables and sstables that make the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter final : public Iterator { + public: + // The following is grossly complicated. TODO: clean it up + // Which direction is the iterator currently moving? + // (1) When moving forward: + // (1a) if current_entry_is_merged_ = false, the internal iterator is + // positioned at the exact entry that yields this->key(), this->value() + // (1b) if current_entry_is_merged_ = true, the internal iterator is + // positioned immediately after the last entry that contributed to the + // current this->value(). That entry may or may not have key equal to + // this->key(). + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { kForward, kReverse }; + + // LocalStatistics contain Statistics counters that will be aggregated per + // each iterator instance and then will be sent to the global statistics when + // the iterator is destroyed. + // + // The purpose of this approach is to avoid perf regression happening + // when multiple threads bump the atomic counters from a DBIter::Next(). 
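+ // E.g. a scan issuing millions of Next() calls would otherwise contend on + // the shared Statistics atomics at every step; here the counts accumulate + // in plain uint64_t fields and are flushed once in BumpGlobalStatistics().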
+ struct LocalStatistics { + explicit LocalStatistics() { ResetCounters(); } + + void ResetCounters() { + next_count_ = 0; + next_found_count_ = 0; + prev_count_ = 0; + prev_found_count_ = 0; + bytes_read_ = 0; + skip_count_ = 0; + } + + void BumpGlobalStatistics(Statistics* global_statistics) { + RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); + RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); + RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); + RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); + RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); + PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); + ResetCounters(); + } + + // Map to Tickers::NUMBER_DB_NEXT + uint64_t next_count_; + // Map to Tickers::NUMBER_DB_NEXT_FOUND + uint64_t next_found_count_; + // Map to Tickers::NUMBER_DB_PREV + uint64_t prev_count_; + // Map to Tickers::NUMBER_DB_PREV_FOUND + uint64_t prev_found_count_; + // Map to Tickers::ITER_BYTES_READ + uint64_t bytes_read_; + // Map to Tickers::NUMBER_ITER_SKIP + uint64_t skip_count_; + }; + + DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const Comparator* cmp, + InternalIterator* iter, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob); + + // No copying allowed + DBIter(const DBIter&) = delete; + void operator=(const DBIter&) = delete; + + ~DBIter() override { + // Release pinned data if any + if (pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + RecordTick(statistics_, NO_ITERATOR_DELETED); + ResetInternalKeysSkippedCounter(); + local_stats_.BumpGlobalStatistics(statistics_); + iter_.DeleteIter(arena_mode_); + } + void SetIter(InternalIterator* iter) { + assert(iter_.iter() == nullptr); + iter_.Set(iter); + iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); + } + ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; } + + bool Valid() const override { return valid_; } + Slice key() const override { + assert(valid_); + if (start_seqnum_ > 0) { + return saved_key_.GetInternalKey(); + } else { + return saved_key_.GetUserKey(); + } + } + Slice value() const override { + assert(valid_); + if (current_entry_is_merged_) { + // If pinned_value_ is set then the result of merge operator is one of + // the merge operands and we should return it. + return pinned_value_.data() ? 
pinned_value_ : saved_value_; + } else if (direction_ == kReverse) { + return pinned_value_; + } else { + return iter_.value(); + } + } + Status status() const override { + if (status_.ok()) { + return iter_.status(); + } else { + assert(!valid_); + return status_; + } + } + bool IsBlob() const { + assert(valid_ && (allow_blob_ || !is_blob_)); + return is_blob_; + } + + Status GetProperty(std::string prop_name, std::string* prop) override; + + void Next() final override; + void Prev() final override; + void Seek(const Slice& target) final override; + void SeekForPrev(const Slice& target) final override; + void SeekToFirst() final override; + void SeekToLast() final override; + Env* env() const { return env_; } + void set_sequence(uint64_t s) { + sequence_ = s; + if (read_callback_) { + read_callback_->Refresh(s); + } + } + void set_valid(bool v) { valid_ = v; } + + private: + // For all methods in this block: + // PRE: iter_->Valid() && status_.ok() + // Return false if there was an error; in that case status() is non-OK and + // valid_ == false, and callers would usually stop what they were doing and + // return. + bool ReverseToForward(); + bool ReverseToBackward(); + // Set saved_key_ to the seek key to target, with proper sequence number set. + // It might get adjusted if the seek key is smaller than iterator lower bound. + void SetSavedKeyToSeekTarget(const Slice& target); + // Set saved_key_ to the seek key to target, with proper sequence number set. + // It might get adjusted if the seek key is larger than iterator upper bound. + void SetSavedKeyToSeekForPrevTarget(const Slice& target); + bool FindValueForCurrentKey(); + bool FindValueForCurrentKeyUsingSeek(); + bool FindUserKeyBeforeSavedKey(); + // If `skipping_saved_key` is true, the function will keep iterating until it + // finds a user key that is larger than `saved_key_`. + // If `prefix` is not null, the iterator needs to stop when all keys for the + // prefix are exhausted and the iterator is set to invalid. + bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); + // Internal implementation of FindNextUserEntry(). + bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); + bool ParseKey(ParsedInternalKey* key); + bool MergeValuesNewToOld(); + + // If prefix is not null, we need to set the iterator to invalid if no more + // entry can be found within the prefix.
+ void PrevInternal(const Slice* prefix); + bool TooManyInternalKeysSkipped(bool increment = true); + bool IsVisible(SequenceNumber sequence); + + // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() + // is called + void TempPinData() { + if (!pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + } + + // Release blocks pinned by TempPinData() + void ReleaseTempPinnedData() { + if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + inline void ResetInternalKeysSkippedCounter() { + local_stats_.skip_count_ += num_internal_keys_skipped_; + if (valid_) { + local_stats_.skip_count_--; + } + num_internal_keys_skipped_ = 0; + } + + bool expect_total_order_inner_iter() { + assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr); + return expect_total_order_inner_iter_; + } + + const SliceTransform* prefix_extractor_; + Env* const env_; + Logger* logger_; + UserComparatorWrapper user_comparator_; + const MergeOperator* const merge_operator_; + IteratorWrapper iter_; + ReadCallback* read_callback_; + // Max visible sequence number. It is normally the snapshot seq unless we have + // uncommitted data in the db, as in WriteUnCommitted. + SequenceNumber sequence_; + + IterKey saved_key_; + // Reusable internal key data structure. This is only used inside one function + // and should not be used across functions. Reusing this object avoids the + // construction overhead that creating a new one on every call would incur. + ParsedInternalKey ikey_; + std::string saved_value_; + Slice pinned_value_; + // for prefix seek mode to support prev() + Statistics* statistics_; + uint64_t max_skip_; + uint64_t max_skippable_internal_keys_; + uint64_t num_internal_keys_skipped_; + const Slice* iterate_lower_bound_; + const Slice* iterate_upper_bound_; + + // The prefix of the seek key. It is only used when prefix_same_as_start_ + // is true and prefix extractor is not null. In Next() or Prev(), current keys + // will be checked against this prefix, so that the iterator can be + // invalidated if the keys in this prefix have been exhausted. Set it using + // SetUserKey() and use it using GetUserKey(). + IterKey prefix_; + + Status status_; + Direction direction_; + bool valid_; + bool current_entry_is_merged_; + // True if we know that the current entry's seqnum is 0. + // This information is used to infer that the next entry will be for another + // user key. + bool is_key_seqnum_zero_; + const bool prefix_same_as_start_; + // Means that we will pin all data blocks we read as long as the Iterator + // is not deleted; will be true if ReadOptions::pin_data is true + const bool pin_thru_lifetime_; + // Expect the inner iterator to maintain a total order. + // prefix_extractor_ must be non-NULL if the value is false. + const bool expect_total_order_inner_iter_; + bool allow_blob_; + bool is_blob_; + bool arena_mode_; + // List of operands for merge operator.
+ MergeContext merge_context_; + ReadRangeDelAggregator range_del_agg_; + LocalStatistics local_stats_; + PinnedIteratorsManager pinned_iters_mgr_; +#ifdef ROCKSDB_LITE + ROCKSDB_FIELD_UNUSED +#endif + DBImpl* db_impl_; +#ifdef ROCKSDB_LITE + ROCKSDB_FIELD_UNUSED +#endif + ColumnFamilyData* cfd_; + // for diff snapshots we want the lower bound on the seqnum; + // if this value is > 0 the iterator will return internal keys + SequenceNumber start_seqnum_; +}; + +// Return a new iterator that converts internal keys (yielded by +// "*internal_iter") that were live at the specified `sequence` number +// into appropriate user keys. +extern Iterator* NewDBIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* user_key_comparator, InternalIterator* internal_iter, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl = nullptr, + ColumnFamilyData* cfd = nullptr, bool allow_blob = false); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_iter_stress_test.cc b/src/rocksdb/db/db_iter_stress_test.cc new file mode 100644 index 000000000..57cd9866e --- /dev/null +++ b/src/rocksdb/db/db_iter_stress_test.cc @@ -0,0 +1,654 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "test_util/testharness.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +#ifdef GFLAGS + +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_bool(verbose, false, + "Print huge, detailed trace. Intended for debugging failures."); + +#else + +void ParseCommandLineFlags(int*, char***, bool) {} +bool FLAGS_verbose = false; + +#endif + +namespace ROCKSDB_NAMESPACE { + +class DBIteratorStressTest : public testing::Test { + public: + Env* env_; + + DBIteratorStressTest() : env_(Env::Default()) {} +}; + +namespace { + +struct Entry { + std::string key; + ValueType type; // kTypeValue, kTypeDeletion, kTypeMerge + uint64_t sequence; + std::string ikey; // internal key, made from `key`, `sequence` and `type` + std::string value; + // If false, we'll pretend that this entry doesn't exist. + bool visible = true; + + bool operator<(const Entry& e) const { + if (key != e.key) return key < e.key; + return std::tie(sequence, type) > std::tie(e.sequence, e.type); + } +}; + +struct Data { + std::vector<Entry> entries; + + // Indices in `entries` with `visible` = false. + std::vector<size_t> hidden; + // Keys of entries whose `visible` changed since the last seek of iterators. + std::set<std::string> recently_touched_keys; +}; + +struct StressTestIterator : public InternalIterator { + Data* data; + Random64* rnd; + InternalKeyComparator cmp; + + // Each operation will return error with this probability... + double error_probability = 0; + // ... and add/remove entries with this probability. + double mutation_probability = 0; + // The probability of adding vs removing entries will be chosen so that the + // amount of removed entries stays somewhat close to this number.
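+ // (E.g. a value of 0.1 aims to keep roughly 10% of entries hidden; see + // the hide_probability choice in MaybeMutate().)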
+  double target_hidden_fraction = 0;
+  // If true, print all mutations to stdout for debugging.
+  bool trace = false;
+
+  int iter = -1;
+  Status status_;
+
+  StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp)
+      : data(_data), rnd(_rnd), cmp(_cmp) {}
+
+  bool Valid() const override {
+    if (iter >= 0 && iter < (int)data->entries.size()) {
+      assert(status_.ok());
+      return true;
+    }
+    return false;
+  }
+
+  Status status() const override { return status_; }
+
+  bool MaybeFail() {
+    if (rnd->Next() >=
+        std::numeric_limits<uint64_t>::max() * error_probability) {
+      return false;
+    }
+    if (rnd->Next() % 2) {
+      status_ = Status::Incomplete("test");
+    } else {
+      status_ = Status::IOError("test");
+    }
+    if (trace) {
+      std::cout << "injecting " << status_.ToString() << std::endl;
+    }
+    iter = -1;
+    return true;
+  }
+
+  void MaybeMutate() {
+    if (rnd->Next() >=
+        std::numeric_limits<uint64_t>::max() * mutation_probability) {
+      return;
+    }
+    do {
+      // If too many entries are hidden, hide less, otherwise hide more.
+      double hide_probability =
+          data->hidden.size() > data->entries.size() * target_hidden_fraction
+              ? 1. / 3
+              : 2. / 3;
+      if (data->hidden.empty()) {
+        hide_probability = 1;
+      }
+      bool do_hide =
+          rnd->Next() < std::numeric_limits<uint64_t>::max() * hide_probability;
+      if (do_hide) {
+        // Hide a random entry.
+        size_t idx = rnd->Next() % data->entries.size();
+        Entry& e = data->entries[idx];
+        if (e.visible) {
+          if (trace) {
+            std::cout << "hiding idx " << idx << std::endl;
+          }
+          e.visible = false;
+          data->hidden.push_back(idx);
+          data->recently_touched_keys.insert(e.key);
+        } else {
+          // Already hidden. Let's go unhide something instead, just because
+          // it's easy and it doesn't really matter what we do.
+          do_hide = false;
+        }
+      }
+      if (!do_hide) {
+        // Unhide a random entry.
+        size_t hi = rnd->Next() % data->hidden.size();
+        size_t idx = data->hidden[hi];
+        if (trace) {
+          std::cout << "unhiding idx " << idx << std::endl;
+        }
+        Entry& e = data->entries[idx];
+        assert(!e.visible);
+        e.visible = true;
+        data->hidden[hi] = data->hidden.back();
+        data->hidden.pop_back();
+        data->recently_touched_keys.insert(e.key);
+      }
+    } while (rnd->Next() % 3 != 0);  // do 3 mutations on average
+  }
+
+  void SkipForward() {
+    while (iter < (int)data->entries.size() && !data->entries[iter].visible) {
+      ++iter;
+    }
+  }
+  void SkipBackward() {
+    while (iter >= 0 && !data->entries[iter].visible) {
+      --iter;
+    }
+  }
+
+  void SeekToFirst() override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    iter = 0;
+    SkipForward();
+  }
+  void SeekToLast() override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    iter = (int)data->entries.size() - 1;
+    SkipBackward();
+  }
+
+  void Seek(const Slice& target) override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    // Binary search.
+    auto it = std::partition_point(
+        data->entries.begin(), data->entries.end(),
+        [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; });
+    iter = (int)(it - data->entries.begin());
+    SkipForward();
+  }
+  void SeekForPrev(const Slice& target) override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    // Binary search.
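+    // (std::partition_point returns the first entry whose internal key
+    // compares greater than `target`; stepping back one element leaves us on
+    // the last entry <= target, which is what SeekForPrev needs.)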
+    auto it = std::partition_point(
+        data->entries.begin(), data->entries.end(),
+        [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; });
+    iter = (int)(it - data->entries.begin());
+    --iter;
+    SkipBackward();
+  }
+
+  void Next() override {
+    assert(Valid());
+    if (MaybeFail()) return;
+    MaybeMutate();
+    ++iter;
+    SkipForward();
+  }
+  void Prev() override {
+    assert(Valid());
+    if (MaybeFail()) return;
+    MaybeMutate();
+    --iter;
+    SkipBackward();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return data->entries[iter].ikey;
+  }
+  Slice value() const override {
+    assert(Valid());
+    return data->entries[iter].value;
+  }
+
+  bool IsKeyPinned() const override { return true; }
+  bool IsValuePinned() const override { return true; }
+};
+
+// A small reimplementation of DBIter, supporting only some of the features,
+// and doing everything in O(log n).
+// Skips all keys that are in recently_touched_keys.
+struct ReferenceIterator {
+  Data* data;
+  uint64_t sequence;  // ignore entries with sequence numbers above this
+
+  bool valid = false;
+  std::string key;
+  std::string value;
+
+  ReferenceIterator(Data* _data, uint64_t _sequence)
+      : data(_data), sequence(_sequence) {}
+
+  bool Valid() const { return valid; }
+
+  // Finds the first entry with key
+  // greater/less/greater-or-equal/less-or-equal than `key`, depending on
+  // arguments: if `skip`, inequality is strict; if `forward`, it's
+  // greater/greater-or-equal, otherwise less/less-or-equal.
+  // Sets `key` to the result.
+  // If no such key exists, returns false. Doesn't check `visible`.
+  bool FindNextKey(bool skip, bool forward) {
+    valid = false;
+    auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+                                   [&](const Entry& e) {
+                                     if (forward != skip) {
+                                       return e.key < key;
+                                     } else {
+                                       return e.key <= key;
+                                     }
+                                   });
+    if (forward) {
+      if (it != data->entries.end()) {
+        key = it->key;
+        return true;
+      }
+    } else {
+      if (it != data->entries.begin()) {
+        --it;
+        key = it->key;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool FindValueForCurrentKey() {
+    if (data->recently_touched_keys.count(key)) {
+      return false;
+    }
+
+    // Find the first entry for the key. The caller promises that it exists.
+    auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+                                   [&](const Entry& e) {
+                                     if (e.key != key) {
+                                       return e.key < key;
+                                     }
+                                     return e.sequence > sequence;
+                                   });
+
+    // Find the first visible entry.
+    for (;; ++it) {
+      if (it == data->entries.end()) {
+        return false;
+      }
+      Entry& e = *it;
+      if (e.key != key) {
+        return false;
+      }
+      assert(e.sequence <= sequence);
+      if (!e.visible) continue;
+      if (e.type == kTypeDeletion) {
+        return false;
+      }
+      if (e.type == kTypeValue) {
+        value = e.value;
+        valid = true;
+        return true;
+      }
+      assert(e.type == kTypeMerge);
+      break;
+    }
+
+    // Collect merge operands.
+    std::vector<Slice> operands;
+    for (; it != data->entries.end(); ++it) {
+      Entry& e = *it;
+      if (e.key != key) {
+        break;
+      }
+      assert(e.sequence <= sequence);
+      if (!e.visible) continue;
+      if (e.type == kTypeDeletion) {
+        break;
+      }
+      operands.push_back(e.value);
+      if (e.type == kTypeValue) {
+        break;
+      }
+    }
+
+    // Do a merge.
+    value = operands.back().ToString();
+    for (int i = (int)operands.size() - 2; i >= 0; --i) {
+      value.append(",");
+      value.append(operands[i].data(), operands[i].size());
+    }
+
+    valid = true;
+    return true;
+  }
+
+  // Start at `key` and move until we encounter a valid value.
+  // `forward` defines the direction of movement.
+  // If `skip` is true, we're looking for a key not equal to `key`.
+  void DoTheThing(bool skip, bool forward) {
+    while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) {
+      skip = true;
+    }
+  }
+
+  void Seek(const Slice& target) {
+    key = target.ToString();
+    DoTheThing(false, true);
+  }
+  void SeekForPrev(const Slice& target) {
+    key = target.ToString();
+    DoTheThing(false, false);
+  }
+  void SeekToFirst() { Seek(""); }
+  void SeekToLast() {
+    key = data->entries.back().key;
+    DoTheThing(false, false);
+  }
+  void Next() {
+    assert(Valid());
+    DoTheThing(true, true);
+  }
+  void Prev() {
+    assert(Valid());
+    DoTheThing(true, false);
+  }
+};
+
+}  // namespace
+
+// Use an internal iterator that sometimes returns errors and sometimes
+// adds/removes entries on the fly. Do random operations on a DBIter and
+// check results.
+// TODO: can be improved for more coverage:
+//   * Override IsKeyPinned() and IsValuePinned() to actually use
+//     PinnedIteratorsManager and check that there's no use-after-free.
+//   * Try different combinations of prefix_extractor, total_order_seek,
+//     prefix_same_as_start, iterate_lower_bound, iterate_upper_bound.
+TEST_F(DBIteratorStressTest, StressTest) {
+  // We use a deterministic RNG, and everything happens in a single thread.
+  Random64 rnd(826909345792864532ll);
+
+  auto gen_key = [&](int max_key) {
+    assert(max_key > 0);
+    int len = 0;
+    int a = max_key;
+    while (a) {
+      a /= 10;
+      ++len;
+    }
+    std::string s = ToString(rnd.Next() % static_cast<uint64_t>(max_key));
+    s.insert(0, len - (int)s.size(), '0');
+    return s;
+  };
+
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ReadOptions ropt;
+
+  size_t num_matching = 0;
+  size_t num_at_end = 0;
+  size_t num_not_ok = 0;
+  size_t num_recently_removed = 0;
+
+  // Number of iterations for each combination of parameters
+  // (there are ~250 of those).
+  // Tweak this to change the test run time.
+  // As of the time of writing, the test takes ~4 seconds for a value of 5000.
+  const int num_iterations = 5000;
+  // Enable this to print all the operations for debugging.
+  bool trace = FLAGS_verbose;
+
+  for (int num_entries : {5, 10, 100}) {
+    for (double key_space : {0.1, 1.0, 3.0}) {
+      for (ValueType prevalent_entry_type :
+           {kTypeValue, kTypeDeletion, kTypeMerge}) {
+        for (double error_probability : {0.01, 0.1}) {
+          for (double mutation_probability : {0.01, 0.5}) {
+            for (double target_hidden_fraction : {0.1, 0.5}) {
+              std::string trace_str =
+                  "entries: " + ToString(num_entries) +
+                  ", key_space: " + ToString(key_space) +
+                  ", error_probability: " + ToString(error_probability) +
+                  ", mutation_probability: " + ToString(mutation_probability) +
+                  ", target_hidden_fraction: " +
+                  ToString(target_hidden_fraction);
+              SCOPED_TRACE(trace_str);
+              if (trace) {
+                std::cout << trace_str << std::endl;
+              }
+
+              // Generate data.
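+              // (Entries get sequence numbers 0..num_entries-1 in creation
+              // order, so an iterator snapshot at sequence `s` sees the
+              // entries numbered <= s; ~10% of entries get a random type
+              // instead of the prevalent one.)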
+              Data data;
+              int max_key = (int)(num_entries * key_space) + 1;
+              for (int i = 0; i < num_entries; ++i) {
+                Entry e;
+                e.key = gen_key(max_key);
+                if (rnd.Next() % 10 != 0) {
+                  e.type = prevalent_entry_type;
+                } else {
+                  const ValueType types[] = {kTypeValue, kTypeDeletion,
+                                             kTypeMerge};
+                  e.type =
+                      types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+                }
+                e.sequence = i;
+                e.value = "v" + ToString(i);
+                ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+                AppendInternalKey(&e.ikey, internal_key);
+
+                data.entries.push_back(e);
+              }
+              std::sort(data.entries.begin(), data.entries.end());
+              if (trace) {
+                std::cout << "entries:";
+                for (size_t i = 0; i < data.entries.size(); ++i) {
+                  Entry& e = data.entries[i];
+                  std::cout
+                      << "\n idx " << i << ": \"" << e.key << "\": \""
+                      << e.value << "\" seq: " << e.sequence << " type: "
+                      << (e.type == kTypeValue
+                              ? "val"
+                              : e.type == kTypeDeletion ? "del" : "merge");
+                }
+                std::cout << std::endl;
+              }
+
+              std::unique_ptr<Iterator> db_iter;
+              std::unique_ptr<ReferenceIterator> ref_iter;
+              for (int iteration = 0; iteration < num_iterations; ++iteration) {
+                SCOPED_TRACE(iteration);
+                // Create a new iterator every ~30 operations.
+                if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+                  uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+                  ref_iter.reset(new ReferenceIterator(&data, sequence));
+                  if (trace) {
+                    std::cout << "new iterator, seq: " << sequence << std::endl;
+                  }
+
+                  auto internal_iter =
+                      new StressTestIterator(&data, &rnd, BytewiseComparator());
+                  internal_iter->error_probability = error_probability;
+                  internal_iter->mutation_probability = mutation_probability;
+                  internal_iter->target_hidden_fraction =
+                      target_hidden_fraction;
+                  internal_iter->trace = trace;
+                  db_iter.reset(NewDBIterator(
+                      env_, ropt, ImmutableCFOptions(options),
+                      MutableCFOptions(options), BytewiseComparator(),
+                      internal_iter, sequence,
+                      options.max_sequential_skip_in_iterations,
+                      nullptr /*read_callback*/));
+                }
+
+                // Do a random operation. It's important to apply it to
+                // ref_iter only after db_iter, to make sure ref_iter sees the
+                // correct recently_touched_keys.
+                std::string old_key;
+                bool forward = rnd.Next() % 2 > 0;
+                // Do Next()/Prev() ~90% of the time.
+                bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+                if (trace) {
+                  std::cout << iteration << ": ";
+                }
+
+                if (!seek) {
+                  assert(db_iter->Valid());
+                  old_key = ref_iter->key;
+                  if (trace) {
+                    std::cout << (forward ? "Next" : "Prev") << std::endl;
+                  }
+
+                  if (forward) {
+                    db_iter->Next();
+                    ref_iter->Next();
+                  } else {
+                    db_iter->Prev();
+                    ref_iter->Prev();
+                  }
+                } else {
+                  data.recently_touched_keys.clear();
+                  // Do SeekToFirst less often than Seek.
+                  if (rnd.Next() % 4 == 0) {
+                    if (trace) {
+                      std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+                                << std::endl;
+                    }
+
+                    if (forward) {
+                      old_key = "";
+                      db_iter->SeekToFirst();
+                      ref_iter->SeekToFirst();
+                    } else {
+                      old_key = data.entries.back().key;
+                      db_iter->SeekToLast();
+                      ref_iter->SeekToLast();
+                    }
+                  } else {
+                    old_key = gen_key(max_key);
+                    if (trace) {
+                      std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+                                << old_key << '"' << std::endl;
+                    }
+                    if (forward) {
+                      db_iter->Seek(old_key);
+                      ref_iter->Seek(old_key);
+                    } else {
+                      db_iter->SeekForPrev(old_key);
+                      ref_iter->SeekForPrev(old_key);
+                    }
+                  }
+                }
+
+                // Check the result.
+                if (db_iter->Valid()) {
+                  ASSERT_TRUE(db_iter->status().ok());
+                  if (data.recently_touched_keys.count(
+                          db_iter->key().ToString())) {
+                    // Ended on a key that may have been mutated during the
+                    // operation. Reference iterator skips such keys, so we
+                    // can't check the exact result.
+
+                    // Check that the key moved in the right direction.
+                    if (forward) {
+                      if (seek)
+                        ASSERT_GE(db_iter->key().ToString(), old_key);
+                      else
+                        ASSERT_GT(db_iter->key().ToString(), old_key);
+                    } else {
+                      if (seek)
+                        ASSERT_LE(db_iter->key().ToString(), old_key);
+                      else
+                        ASSERT_LT(db_iter->key().ToString(), old_key);
+                    }
+
+                    if (ref_iter->Valid()) {
+                      // Check that DBIter didn't miss any non-mutated key.
+                      if (forward) {
+                        ASSERT_LT(db_iter->key().ToString(), ref_iter->key);
+                      } else {
+                        ASSERT_GT(db_iter->key().ToString(), ref_iter->key);
+                      }
+                    }
+                    // Tell the next iteration of the loop to reseek the
+                    // iterators.
+                    ref_iter->valid = false;
+
+                    ++num_recently_removed;
+                  } else {
+                    ASSERT_TRUE(ref_iter->Valid());
+                    ASSERT_EQ(ref_iter->key, db_iter->key().ToString());
+                    ASSERT_EQ(ref_iter->value, db_iter->value());
+                    ++num_matching;
+                  }
+                } else if (db_iter->status().ok()) {
+                  ASSERT_FALSE(ref_iter->Valid());
+                  ++num_at_end;
+                } else {
+                  // Non-ok status. Nothing to check here.
+                  // Tell the next iteration of the loop to reseek the
+                  // iterators.
+                  ref_iter->valid = false;
+                  ++num_not_ok;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Check that all cases were hit many times.
+  EXPECT_GT(num_matching, 10000);
+  EXPECT_GT(num_at_end, 10000);
+  EXPECT_GT(num_not_ok, 10000);
+  EXPECT_GT(num_recently_removed, 10000);
+
+  std::cout << "stats:\n exact matches: " << num_matching
+            << "\n end reached: " << num_at_end
+            << "\n non-ok status: " << num_not_ok
+            << "\n mutated on the fly: " << num_recently_removed << std::endl;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
new file mode 100644
index 000000000..ddbea8d17
--- /dev/null
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -0,0 +1,3175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
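+
+// These tests drive NewDBIterator() over a hand-built TestIterator (defined
+// below), so DBIter's merge, deletion, snapshot-visibility and skip-counting
+// logic can be exercised without building a real DB.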
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static uint64_t TestGetTickerCount(const Options& options,
+                                   Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public InternalIterator {
+ public:
+  explicit TestIterator(const Comparator* comparator)
+      : initialized_(false),
+        valid_(false),
+        sequence_number_(0),
+        iter_(0),
+        cmp(comparator) {
+    data_.reserve(16);
+  }
+
+  void AddPut(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeValue, argvalue);
+  }
+
+  void AddDeletion(std::string argkey) {
+    Add(argkey, kTypeDeletion, std::string());
+  }
+
+  void AddSingleDeletion(std::string argkey) {
+    Add(argkey, kTypeSingleDeletion, std::string());
+  }
+
+  void AddMerge(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeMerge, argvalue);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue) {
+    Add(argkey, type, argvalue, sequence_number_++);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue,
+           size_t seq_num, bool update_iter = false) {
+    valid_ = true;
+    ParsedInternalKey internal_key(argkey, seq_num, type);
+    data_.push_back(
+        std::pair<std::string, std::string>(std::string(), argvalue));
+    AppendInternalKey(&data_.back().first, internal_key);
+    if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+      // insert a key smaller than current key
+      Finish();
+      // data_[iter_] is no longer the current element of the iterator.
+      // Increment iter_ to restore the correct position.
+      iter_++;
+    }
+  }
+
+  // Should be called before any operations on the iterator.
+  void Finish() {
+    initialized_ = true;
+    std::sort(data_.begin(), data_.end(),
+              [this](std::pair<std::string, std::string> a,
+                     std::pair<std::string, std::string> b) {
+                return (cmp.Compare(a.first, b.first) < 0);
+              });
+  }
+
+  // Removes the key from the set of keys over which this iterator iterates.
+  // Not to be confused with AddDeletion().
+  // If the iterator is currently positioned on this key, the deletion will
+  // apply next time the iterator moves.
+  // Used for simulating ForwardIterator updating to a new version that doesn't
+  // have some of the keys (e.g. after compaction with a filter).
+  void Vanish(std::string _key) {
+    if (valid_ && data_[iter_].first == _key) {
+      delete_current_ = true;
+      return;
+    }
+    for (auto it = data_.begin(); it != data_.end(); ++it) {
+      ParsedInternalKey ikey;
+      bool ok __attribute__((__unused__)) = ParseInternalKey(it->first, &ikey);
+      assert(ok);
+      if (ikey.user_key != _key) {
+        continue;
+      }
+      if (valid_ && data_.begin() + iter_ > it) {
+        --iter_;
+      }
+      data_.erase(it);
+      return;
+    }
+    assert(false);
+  }
+
+  // Number of operations done on this iterator since construction.
+  size_t steps() const { return steps_; }
+
+  bool Valid() const override {
+    assert(initialized_);
+    return valid_;
+  }
+
+  void SeekToFirst() override {
+    assert(initialized_);
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    valid_ = (data_.size() > 0);
+    iter_ = 0;
+  }
+
+  void SeekToLast() override {
+    assert(initialized_);
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    valid_ = (data_.size() > 0);
+    iter_ = data_.size() - 1;
+  }
+
+  void Seek(const Slice& target) override {
+    assert(initialized_);
+    SeekToFirst();
+    ++steps_;
+    if (!valid_) {
+      return;
+    }
+    while (iter_ < data_.size() &&
+           (cmp.Compare(data_[iter_].first, target) < 0)) {
+      ++iter_;
+    }
+
+    if (iter_ == data_.size()) {
+      valid_ = false;
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    assert(initialized_);
+    DeleteCurrentIfNeeded();
+    SeekForPrevImpl(target, &cmp);
+  }
+
+  void Next() override {
+    assert(initialized_);
+    assert(valid_);
+    assert(iter_ < data_.size());
+
+    ++steps_;
+    if (delete_current_) {
+      DeleteCurrentIfNeeded();
+    } else {
+      ++iter_;
+    }
+    valid_ = iter_ < data_.size();
+  }
+
+  void Prev() override {
+    assert(initialized_);
+    assert(valid_);
+    assert(iter_ < data_.size());
+
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    if (iter_ == 0) {
+      valid_ = false;
+    } else {
+      --iter_;
+    }
+  }
+
+  Slice key() const override {
+    assert(initialized_);
+    return data_[iter_].first;
+  }
+
+  Slice value() const override {
+    assert(initialized_);
+    return data_[iter_].second;
+  }
+
+  Status status() const override {
+    assert(initialized_);
+    return Status::OK();
+  }
+
+  bool IsKeyPinned() const override { return true; }
+  bool IsValuePinned() const override { return true; }
+
+ private:
+  bool initialized_;
+  bool valid_;
+  size_t sequence_number_;
+  size_t iter_;
+  size_t steps_ = 0;
+
+  InternalKeyComparator cmp;
+  std::vector<std::pair<std::string, std::string>> data_;
+  bool delete_current_ = false;
+
+  void DeleteCurrentIfNeeded() {
+    if (!delete_current_) {
+      return;
+    }
+    data_.erase(data_.begin() + iter_);
+    delete_current_ = false;
+  }
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+  Env* env_;
+
+  DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+  Options options;
+  ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound not set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
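+    // ("b" is written twice here, so backward iteration also has to step over
+    // an older version of the same user key.)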
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+
+  // Test to check the SeekToLast() with iterate_upper_bound set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->AddPut("f", "val_f");
+    internal_iter->Finish();
+
+    Slice prefix("d");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound set to a key
+  // that is not put yet
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->Finish();
+
+    Slice prefix("z");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound set to the
+  // first key
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+  // Test case to check SeekToLast with iterate_upper_bound set
+  // (same key put many times - SeekToLast should start with the
+  // maximum sequence id of the upper bound)
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 7, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    get_perf_context()->Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_key_skipped_count), 1);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
+  // Test to check the SeekToLast() with the iterate_upper_bound set
+  // (Checking the value of the key which has sequence ids greater than
+  // and less than the iterator's sequence id)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a1");
+    internal_iter->AddPut("a", "val_a2");
+    internal_iter->AddPut("b", "val_b1");
+    internal_iter->AddPut("c", "val_c1");
+    internal_iter->AddPut("c", "val_c2");
+    internal_iter->AddPut("c", "val_c3");
+    internal_iter->AddPut("b", "val_b2");
+    internal_iter->AddPut("d", "val_d1");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 4, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+  }
+
+  // Test to check the SeekToLast() with the iterate_upper_bound set to the
+  // key that is deleted
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+  // Test to check the SeekToLast() with the iterate_upper_bound set
+  // (Deletion cases)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
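+    // ("b" is masked by its tombstone and "c" is excluded by the upper bound,
+    // so the last visible key is "a".)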
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound set
+  // (Deletion cases - lots of internal keys after the upper_bound
+  // are deleted)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("e");
+    internal_iter->AddDeletion("f");
+    internal_iter->AddDeletion("g");
+    internal_iter->AddDeletion("h");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 7, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    get_perf_context()->Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count),
+        0);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 2, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+  Options options;
+  ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  ReadOptions ro;
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 0, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 0, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+  ReadOptions ro;
+  Options options;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (size_t i = 0; i < 200; ++i) {
+    internal_iter->AddPut("a", "a");
+    internal_iter->AddPut("b", "b");
+    internal_iter->AddPut("c", "c");
+  }
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 2,
+      options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "c");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "b");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "a");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(!db_iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
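+  // ("stringappend" joins merge operands with ',': merging "0" then "1" over
+  // no base value yields "0,1", which the merge cases below depend on.)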
+  ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", ToString(k));
+      }
+      internal_iter->Finish();
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+
+    {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t i = 0; i < 200; ++i) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, 202, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "200");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, i, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(!db_iter->Valid());
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (size_t i = 0; i < 200; ++i) {
+      internal_iter->AddDeletion("c");
+    }
+    internal_iter->AddPut("c", "200");
+    internal_iter->Finish();
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 200, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("d", ToString(k));
+      }
+
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", ToString(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "d");
+      ASSERT_EQ(db_iter->value().ToString(), ToString(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "b");
+      internal_iter->AddMerge("a", "a");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddMerge("c", ToString(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, i + 2, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      std::string merge_result = "0";
+      for (size_t j = 1; j <= i; ++j) {
+        merge_result += "," + ToString(j);
+      }
+      ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "b");
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "a");
+
+      db_iter->Prev();
+      ASSERT_TRUE(!db_iter->Valid());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
+  Options options;
+  ImmutableCFOptions cf_options = ImmutableCFOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  ReadOptions ro;
+
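+  // ReadOptions::max_skippable_internal_keys bounds how many hidden internal
+  // entries (tombstones and superseded versions) a single Next()/Prev() may
+  // scan before giving up with Status::Incomplete(); 0 disables the limit.
+  // Typical use, as assumed in the cases below:
+  //   ReadOptions ro;
+  //   ro.max_skippable_internal_keys = 1000;  // fail fast in tombstone runs
+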
+  // Basic test case ... Make sure explicitly passing the default value works.
+  // Skipping internal keys is disabled by default when the value is 0.
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 0;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+  }
+
+  // Test to make sure that the request will *not* fail as incomplete if
+  // num_internal_keys_skipped is *equal* to the max_skippable_internal_keys
+  // threshold. (It will fail as incomplete only when the threshold is
+  // exceeded.)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+  }
+
+  // Fail the request as incomplete when num_internal_keys_skipped >
+  // max_skippable_internal_keys
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that the num_internal_keys_skipped counter resets after a successful
+  // read.
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Next();  // num_internal_keys_skipped counter resets here.
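+    // (The two "b" tombstones were within the limit of 2, but the three "d"
+    // tombstones exceed it, so this Next() fails with an incomplete status.)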
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that the num_internal_keys_skipped counter resets after a successful
+  // read.
+  // Reverse direction
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();  // num_internal_keys_skipped counter resets here.
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that skipping separate keys is handled
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test if alternating puts and deletes of the same key are handled
+  // correctly.
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+        internal_iter, 10, options.max_sequential_skip_in_iterations,
+        nullptr /*read_callback*/));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test for large number of skippable internal keys with *default*
+  // max_sequential_skip_in_iterations.
+  {
+    for (size_t i = 1; i <= 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddPut("a", "val_a");
+      for (size_t j = 1; j <= i; ++j) {
+        internal_iter->AddPut("b", "val_b");
+        internal_iter->AddDeletion("b");
+      }
+      internal_iter->AddPut("c", "val_c");
+      internal_iter->Finish();
+
+      ro.max_skippable_internal_keys = i;
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+      db_iter->Next();
+      if ((options.max_sequential_skip_in_iterations + 1) >=
+          ro.max_skippable_internal_keys) {
+        ASSERT_TRUE(!db_iter->Valid());
+        ASSERT_TRUE(db_iter->status().IsIncomplete());
+      } else {
+        ASSERT_TRUE(db_iter->Valid());
+        ASSERT_EQ(db_iter->key().ToString(), "c");
+        ASSERT_EQ(db_iter->value().ToString(), "val_c");
+      }
+
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+      db_iter->Prev();
+      if ((options.max_sequential_skip_in_iterations + 1) >=
+          ro.max_skippable_internal_keys) {
+        ASSERT_TRUE(!db_iter->Valid());
+        ASSERT_TRUE(db_iter->status().IsIncomplete());
+      } else {
+        ASSERT_TRUE(db_iter->Valid());
+        ASSERT_EQ(db_iter->key().ToString(), "a");
+        ASSERT_EQ(db_iter->value().ToString(), "val_a");
+      }
+    }
+  }
+
+  // Test for large number of skippable internal keys with a *non-default*
+  // max_sequential_skip_in_iterations.
+  {
+    for (size_t i = 1; i <= 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddPut("a", "val_a");
+      for (size_t j = 1; j <= i; ++j) {
+        internal_iter->AddPut("b", "val_b");
+        internal_iter->AddDeletion("b");
+      }
+      internal_iter->AddPut("c", "val_c");
+      internal_iter->Finish();
+
+      options.max_sequential_skip_in_iterations = 1000;
+      ro.max_skippable_internal_keys = i;
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, cf_options, mutable_cf_options, BytewiseComparator(),
+          internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations,
+          nullptr /*read_callback*/));
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+      db_iter->Next();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+      db_iter->Prev();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_TRUE(db_iter->status().IsIncomplete());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 1,
+      options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 0,
+      options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} + +TEST_F(DBIteratorTest, DBIterator4) { + ReadOptions ro; + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0,1"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "2"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} + +TEST_F(DBIteratorTest, DBIterator5) { + ReadOptions ro; + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 1, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", 
"merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 3, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 4, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 5, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + 
internal_iter, 6, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + // put, singledelete, merge + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "val_a"); + internal_iter->AddSingleDeletion("a"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddPut("b", "val_b"); + internal_iter->Finish(); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->Seek("b"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + } +} + +TEST_F(DBIteratorTest, DBIterator6) { + ReadOptions ro; + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 1, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2, 
options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 3, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 4, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 5, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 6, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } +} + +TEST_F(DBIteratorTest, DBIterator7) { + ReadOptions ro; + Options options; + options.merge_operator = 
MergeOperators::CreateFromStringId("stringappend"); + ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "val,merge_2"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 4, options.max_sequential_skip_in_iterations, + nullptr 
/*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 5, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4"); + db_iter->Prev(); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 6, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + 
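   // A descriptive note (added): the deletion below supersedes "val" and
+    // "merge_2"; operands added after it (merge_3 onward) start a fresh
+    // merge chain for "b".
+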
internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 7, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 9, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 13, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + 
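  // A descriptive note (added): at sequence 13 the second tombstone for "b"
+    // and all six operands after it are visible, so "b" folds to merge_6
+    // through merge_11 in the assertions below.
+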
db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), + "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } + + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 14, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), + "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } +} + +TEST_F(DBIteratorTest, DBIterator8) { + ReadOptions ro; + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddDeletion("a"); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); +} + +// TODO(3.13): fix the issue of Seek() then Prev() which might not necessary +// return the biggest element smaller than the seek key. 
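+// Until that is fixed, SeekForPrev() is the dependable way to position at the
+// largest key <= target. A minimal sketch (hypothetical iterator `it` over
+// keys {"a", "b", "d"}, matching the data in DBIterator9 below):
+//
+//   it->SeekForPrev("c");  // lands on "b", the largest key <= "c"
+//   if (it->Valid()) { /* use it->key(), it->value() */ }
+//
+// DBIterator9 below exercises both Seek()+Prev() and SeekForPrev().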
+TEST_F(DBIteratorTest, DBIterator9) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("b", "merge_3");
+    internal_iter->AddMerge("b", "merge_4");
+    internal_iter->AddMerge("d", "merge_5");
+    internal_iter->AddMerge("d", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, 10,
+        options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+    db_iter->Seek("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+    db_iter->SeekForPrev("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+    db_iter->Seek("c");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+
+    db_iter->SeekForPrev("c");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+  }
+}
+
+// TODO(3.13): fix the issue of Seek() then Prev() which might not necessarily
+// return the biggest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator10) { + ReadOptions ro; + Options options; + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "1"); + internal_iter->AddPut("b", "2"); + internal_iter->AddPut("c", "3"); + internal_iter->AddPut("d", "4"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + + db_iter->Seek("c"); + ASSERT_TRUE(db_iter->Valid()); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "2"); + + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "3"); + + db_iter->SeekForPrev("c"); + ASSERT_TRUE(db_iter->Valid()); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "d"); + ASSERT_EQ(db_iter->value().ToString(), "4"); + + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "3"); +} + +TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { + ReadOptions ro; + Options options; + options.merge_operator = nullptr; + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "1"); + internal_iter->AddPut("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, 0 /* force seek */, + nullptr /*read_callback*/)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "1"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "2"); + db_iter->Next(); + ASSERT_FALSE(db_iter->Valid()); +} + +TEST_F(DBIteratorTest, DBIterator11) { + ReadOptions ro; + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddSingleDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + db_iter->Next(); + ASSERT_FALSE(db_iter->Valid()); +} + +TEST_F(DBIteratorTest, DBIterator12) { + ReadOptions ro; + Options options; + options.merge_operator = nullptr; + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "1"); + internal_iter->AddPut("b", "2"); + internal_iter->AddPut("c", "3"); + internal_iter->AddSingleDeletion("b"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + 
BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "1");
+  db_iter->Prev();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator13) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  std::string key;
+  key.resize(9);
+  key.assign(9, static_cast<char>(0));
+  key[0] = 'b';
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut(key, "0");
+  internal_iter->AddPut(key, "1");
+  internal_iter->AddPut(key, "2");
+  internal_iter->AddPut(key, "3");
+  internal_iter->AddPut(key, "4");
+  internal_iter->AddPut(key, "5");
+  internal_iter->AddPut(key, "6");
+  internal_iter->AddPut(key, "7");
+  internal_iter->AddPut(key, "8");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/));
+  db_iter->Seek("b");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), key);
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+}
+
+TEST_F(DBIteratorTest, DBIterator14) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  std::string key("b");
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddPut("b", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("b", "3");
+  internal_iter->AddPut("a", "4");
+  internal_iter->AddPut("a", "5");
+  internal_iter->AddPut("a", "6");
+  internal_iter->AddPut("c", "7");
+  internal_iter->AddPut("c", "8");
+  internal_iter->AddPut("c", "9");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/));
+  db_iter->Seek("b");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+  db_iter->SeekToFirst();
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "4");
+}
+
+TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) {
+  { // test that KVs earlier than iter_start_seqnum are filtered out
+    ReadOptions ro;
+    ro.iter_start_seqnum = 5;
+    Options options;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (size_t i = 0; i < 10; ++i) {
+      internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a");
+      internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b");
+      internal_iter->AddPut(std::to_string(i), std::to_string(i) + "c");
+    }
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, 13,
+        options.max_sequential_skip_in_iterations, nullptr));
+    // Expect internal keys with seqnum >= 5 and the correct type
+    int seqnums[4] = {5, 8, 11, 13};
+    std::string user_keys[4] = {"1", "2", "3", "4"};
+    std::string values[4] = {"1c", "2c", "3c", "4b"};
+    int i = 0;
+    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+      FullKey fkey;
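+      // ParseFullKey decodes the internal key into its user key, sequence
+      // number, and entry type, which the assertions below inspect.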
+      ParseFullKey(db_iter->key(), &fkey);
+      ASSERT_EQ(user_keys[i], fkey.user_key.ToString());
+      ASSERT_EQ(EntryType::kEntryPut, fkey.type);
+      ASSERT_EQ(seqnums[i], fkey.sequence);
+      ASSERT_EQ(values[i], db_iter->value().ToString());
+      i++;
+    }
+    ASSERT_EQ(i, 4);
+  }
+
+  { // Test that deletes are returned correctly as internal KVs
+    ReadOptions ro;
+    ro.iter_start_seqnum = 5;
+    Options options;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (size_t i = 0; i < 10; ++i) {
+      internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a");
+      internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b");
+      internal_iter->AddDeletion(std::to_string(i));
+    }
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, 13,
+        options.max_sequential_skip_in_iterations, nullptr));
+    // Expect internal keys with seqnum >= 5 and the correct type
+    int seqnums[4] = {5, 8, 11, 13};
+    EntryType key_types[4] = {EntryType::kEntryDelete, EntryType::kEntryDelete,
+                              EntryType::kEntryDelete, EntryType::kEntryPut};
+    std::string user_keys[4] = {"1", "2", "3", "4"};
+    std::string values[4] = {"", "", "", "4b"};
+    int i = 0;
+    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+      FullKey fkey;
+      ParseFullKey(db_iter->key(), &fkey);
+      ASSERT_EQ(user_keys[i], fkey.user_key.ToString());
+      ASSERT_EQ(key_types[i], fkey.type);
+      ASSERT_EQ(seqnums[i], fkey.sequence);
+      ASSERT_EQ(values[i], db_iter->value().ToString());
+      i++;
+    }
+    ASSERT_EQ(i, 4);
+  }
+}
+
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+  DBIterWithMergeIterTest()
+      : env_(Env::Default()), icomp_(BytewiseComparator()) {
+    options_.merge_operator = nullptr;
+
+    internal_iter1_ = new TestIterator(BytewiseComparator());
+    internal_iter1_->Add("a", kTypeValue, "1", 3u);
+    internal_iter1_->Add("f", kTypeValue, "2", 5u);
+    internal_iter1_->Add("g", kTypeValue, "3", 7u);
+    internal_iter1_->Finish();
+
+    internal_iter2_ = new TestIterator(BytewiseComparator());
+    internal_iter2_->Add("a", kTypeValue, "4", 6u);
+    internal_iter2_->Add("b", kTypeValue, "5", 1u);
+    internal_iter2_->Add("c", kTypeValue, "6", 2u);
+    internal_iter2_->Add("d", kTypeValue, "7", 3u);
+    internal_iter2_->Finish();
+
+    std::vector<InternalIterator*> child_iters;
+    child_iters.push_back(internal_iter1_);
+    child_iters.push_back(internal_iter2_);
+    InternalKeyComparator icomp(BytewiseComparator());
+    InternalIterator* merge_iter =
+        NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+    db_iter_.reset(NewDBIterator(
+        env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_),
+        BytewiseComparator(), merge_iter,
+        8 /* read data earlier than seqId 8 */,
+        3 /* max iterators before reseek */, nullptr /*read_callback*/));
+  }
+
+  Env* env_;
+  ReadOptions ro_;
+  Options options_;
+  TestIterator* internal_iter1_;
+  TestIterator* internal_iter2_;
+  InternalKeyComparator icomp_;
+  Iterator* merge_iter_;
+  std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+  db_iter_->SeekToFirst();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+
ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Next(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Next(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + db_iter_->Next(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "g"); + ASSERT_EQ(db_iter_->value().ToString(), "3"); + db_iter_->Next(); + ASSERT_FALSE(db_iter_->Valid()); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) { + // Test Prev() when one child iterator is at its end. + db_iter_->SeekForPrev("g"); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "g"); + ASSERT_EQ(db_iter_->value().ToString(), "3"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "b"); + ASSERT_EQ(db_iter_->value().ToString(), "5"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "a"); + ASSERT_EQ(db_iter_->value().ToString(), "4"); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) { + // Test Prev() when one child iterator is at its end but more rows + // are added. + db_iter_->Seek("f"); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + + // Test call back inserts a key in the end of the mem table after + // MergeIterator::Prev() realized the mem table iterator is at its end + // and before an SeekToLast() is called. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "MergeIterator::Prev:BeforePrev", + [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "b"); + ASSERT_EQ(db_iter_->value().ToString(), "5"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "a"); + ASSERT_EQ(db_iter_->value().ToString(), "4"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) { + // Test Prev() when one child iterator is at its end but more rows + // are added. + db_iter_->Seek("f"); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + + // Test call back inserts entries for update a key in the end of the + // mem table after MergeIterator::Prev() realized the mem tableiterator is at + // its end and before an SeekToLast() is called. 
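+  // A descriptive note (added): both new entries share user key "z" and carry
+  // sequence numbers (12, 11) above the iterator's snapshot at 8, so reverse
+  // iteration must skip them without ever surfacing "z".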
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) { + internal_iter2_->Add("z", kTypeValue, "7", 12u); + internal_iter2_->Add("z", kTypeValue, "7", 11u); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "b"); + ASSERT_EQ(db_iter_->value().ToString(), "5"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "a"); + ASSERT_EQ(db_iter_->value().ToString(), "4"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) { + // Test Prev() when one child iterator is at its end but more rows + // are added and max_skipped is triggered. + db_iter_->Seek("f"); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + + // Test call back inserts entries for update a key in the end of the + // mem table after MergeIterator::Prev() realized the mem table iterator is at + // its end and before an SeekToLast() is called. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) { + internal_iter2_->Add("z", kTypeValue, "7", 16u, true); + internal_iter2_->Add("z", kTypeValue, "7", 15u, true); + internal_iter2_->Add("z", kTypeValue, "7", 14u, true); + internal_iter2_->Add("z", kTypeValue, "7", 13u, true); + internal_iter2_->Add("z", kTypeValue, "7", 12u, true); + internal_iter2_->Add("z", kTypeValue, "7", 11u, true); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "b"); + ASSERT_EQ(db_iter_->value().ToString(), "5"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "a"); + ASSERT_EQ(db_iter_->value().ToString(), "4"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) { + // Test Prev() when one child iterator has more rows inserted + // between Seek() and Prev() when changing directions. 
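+  // A descriptive note (added): key "z" (seq 4) is visible and sorts past the
+  // seek target "g", so the change from forward to reverse iteration has to
+  // step back over it.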
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before it calls Prev().
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
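+  // A descriptive note (added): unlike DataRace3 above, only two hidden "x"
+  // entries are injected, so the backward skip stays below the fixture's
+  // reseek threshold of 3.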
+ db_iter_->Seek("g"); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "g"); + ASSERT_EQ(db_iter_->value().ToString(), "3"); + + // Test call back inserts entries for update a key before "z" in + // mem table after MergeIterator::Prev() calls mem table iterator's + // Seek() and before calling Prev() + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "MergeIterator::Prev:BeforePrev", [&](void* arg) { + IteratorWrapper* it = reinterpret_cast(arg); + if (it->key().starts_with("z")) { + internal_iter2_->Add("x", kTypeValue, "7", 16u, true); + internal_iter2_->Add("x", kTypeValue, "7", 15u, true); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "b"); + ASSERT_EQ(db_iter_->value().ToString(), "5"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "a"); + ASSERT_EQ(db_iter_->value().ToString(), "4"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) { + internal_iter2_->Add("z", kTypeValue, "9", 4u); + + // Test Prev() when one child iterator has more rows inserted + // between Seek() and Prev() when changing directions. + db_iter_->Seek("g"); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "g"); + ASSERT_EQ(db_iter_->value().ToString(), "3"); + + // Test call back inserts an entry for update a key before "z" in + // mem table after MergeIterator::Prev() calls mem table iterator's + // Seek() and before calling Prev() + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "MergeIterator::Prev:BeforePrev", [&](void* arg) { + IteratorWrapper* it = reinterpret_cast(arg); + if (it->key().starts_with("z")) { + internal_iter2_->Add("x", kTypeValue, "7", 16u, true); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "b"); + ASSERT_EQ(db_iter_->value().ToString(), "5"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "a"); + ASSERT_EQ(db_iter_->value().ToString(), "4"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) { + internal_iter1_->Add("u", kTypeValue, "10", 4u); + internal_iter1_->Add("v", kTypeValue, "11", 4u); + internal_iter1_->Add("w", kTypeValue, "12", 4u); + internal_iter2_->Add("z", kTypeValue, "9", 4u); + + // Test Prev() when one child iterator 
has more rows inserted + // between Seek() and Prev() when changing directions. + db_iter_->Seek("g"); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "g"); + ASSERT_EQ(db_iter_->value().ToString(), "3"); + + // Test call back inserts entries for update a key before "z" in + // mem table after MergeIterator::Prev() calls mem table iterator's + // Seek() and before calling Prev() + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "MergeIterator::Prev:BeforePrev", [&](void* arg) { + IteratorWrapper* it = reinterpret_cast(arg); + if (it->key().starts_with("z")) { + internal_iter2_->Add("x", kTypeValue, "7", 16u, true); + internal_iter2_->Add("x", kTypeValue, "7", 15u, true); + internal_iter2_->Add("x", kTypeValue, "7", 14u, true); + internal_iter2_->Add("x", kTypeValue, "7", 13u, true); + internal_iter2_->Add("x", kTypeValue, "7", 12u, true); + internal_iter2_->Add("x", kTypeValue, "7", 11u, true); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "f"); + ASSERT_EQ(db_iter_->value().ToString(), "2"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "d"); + ASSERT_EQ(db_iter_->value().ToString(), "7"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "c"); + ASSERT_EQ(db_iter_->value().ToString(), "6"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "b"); + ASSERT_EQ(db_iter_->value().ToString(), "5"); + db_iter_->Prev(); + ASSERT_TRUE(db_iter_->Valid()); + ASSERT_EQ(db_iter_->key().ToString(), "a"); + ASSERT_EQ(db_iter_->value().ToString(), "4"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) { + // internal_iter1_: a, f, g + // internal_iter2_: a, b, c, d, adding (z) + internal_iter2_->Add("z", kTypeValue, "9", 4u); + + // Test Prev() when one child iterator has more rows inserted + // between Seek() and Prev() when changing directions. 
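+  // A descriptive note (added): the two callback keys ("x"@16, "y"@17) sit
+  // above the snapshot at 8, so Prev() must pass over both silently, landing
+  // on "f" and then "d".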
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The callback inserts two keys before "z" in the mem table after
+  // MergeIterator::Prev() calls the mem table iterator's Seek() and
+  // before it calls Prev().
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIteratorTest, SeekPrefixTombstones) {
+  ReadOptions ro;
+  Options options;
+  options.prefix_extractor.reset(NewNoopTransform());
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddDeletion("b");
+  internal_iter->AddDeletion("c");
+  internal_iter->AddDeletion("d");
+  internal_iter->AddDeletion("e");
+  internal_iter->AddDeletion("f");
+  internal_iter->AddDeletion("g");
+  internal_iter->Finish();
+
+  ro.prefix_same_as_start = true;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 10,
+      options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+  int skipped_keys = 0;
+
+  get_perf_context()->Reset();
+  db_iter->SeekForPrev("z");
+  skipped_keys =
+      static_cast<int>(get_perf_context()->internal_key_skipped_count);
+  ASSERT_EQ(skipped_keys, 0);
+
+  get_perf_context()->Reset();
+  db_iter->Seek("a");
+  skipped_keys =
+      static_cast<int>(get_perf_context()->internal_key_skipped_count);
+  ASSERT_EQ(skipped_keys, 0);
+}
+
+TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
+  const int kNumKeys = 3;
+  for (int i = 0; i < kNumKeys + 2; ++i) {
+    // + 2 for two special cases: lower bound before and lower bound after the
+    // internal iterator's keys
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (int j = 1; j <= kNumKeys; ++j) {
+      internal_iter->AddPut(std::to_string(j), "val");
+    }
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    auto lower_bound_str = std::to_string(i);
+    Slice lower_bound(lower_bound_str);
+    ro.iterate_lower_bound = &lower_bound;
+    Options options;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    if (i == kNumKeys + 1) {
+      // lower bound was beyond the last key
+      ASSERT_FALSE(db_iter->Valid());
+    } else {
+      ASSERT_TRUE(db_iter->Valid());
+      int expected;
+      if (i == 0) {
+        // lower bound was before the first key
+        expected = 1;
+      } else {
+        // lower bound was at the ith key
+        expected = i;
+      }
+      ASSERT_EQ(std::to_string(expected), db_iter->key().ToString());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, PrevLowerBound) {
+  const int kNumKeys = 3;
+  const int kLowerBound = 2;
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (int j = 1; j <= kNumKeys; ++j) {
+    internal_iter->AddPut(std::to_string(j), "val");
+  }
+  internal_iter->Finish();
+
+  ReadOptions ro;
+  auto lower_bound_str = std::to_string(kLowerBound);
+  Slice lower_bound(lower_bound_str);
+  ro.iterate_lower_bound = &lower_bound;
+  Options options;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 10 /* sequence */,
+      options.max_sequential_skip_in_iterations, nullptr /* read_callback */));
+
+  db_iter->SeekToLast();
+  for (int i = kNumKeys; i >= kLowerBound; --i) {
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(std::to_string(i), db_iter->key().ToString());
+    db_iter->Prev();
+  }
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, SeekLessLowerBound) {
+  const int kNumKeys = 3;
+  const int kLowerBound = 2;
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (int j = 1; j <= kNumKeys; ++j) {
+    internal_iter->AddPut(std::to_string(j), "val");
+  }
+  internal_iter->Finish();
+
+  ReadOptions ro;
+  auto lower_bound_str = std::to_string(kLowerBound);
+  Slice lower_bound(lower_bound_str);
+  ro.iterate_lower_bound = &lower_bound;
+  Options options;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableCFOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, 10 /* sequence */,
+      options.max_sequential_skip_in_iterations, nullptr /* read_callback */));
+
+  auto before_lower_bound_str = std::to_string(kLowerBound - 1);
+  Slice before_lower_bound(before_lower_bound_str);
+
+  db_iter->Seek(before_lower_bound);
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(lower_bound_str, db_iter->key().ToString());
+}
+
+TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
+  Options options;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(0));
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "A");
+  internal_iter->AddPut("b", "B");
+  for (int i = 0; i < 100; ++i) {
+    internal_iter->AddPut("c" + ToString(i), "");
+  }
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ReadOptions(), ImmutableCFOptions(options),
+      MutableCFOptions(options), BytewiseComparator(), internal_iter, 10,
+      options.max_sequential_skip_in_iterations, nullptr /*read_callback*/));
+
+  db_iter->SeekForPrev("a");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ("a", db_iter->key().ToString());
+
+  internal_iter->Vanish("a");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ("b", db_iter->key().ToString());
+
+  // A (sort of) bug used to cause DBIter to pointlessly drag the internal
+  // iterator all the way to the end. But this doesn't really matter at the
+  // time of writing because the only iterator that can see disappearing keys
+  // is ForwardIterator, which doesn't support SeekForPrev().
+  EXPECT_LT(internal_iter->steps(), 20);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iterator_test.cc b/src/rocksdb/db/db_iterator_test.cc
new file mode 100644
index 000000000..99ffb5ce4
--- /dev/null
+++ b/src/rocksdb/db/db_iterator_test.cc
@@ -0,0 +1,2998 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A dummy ReadCallback which says every key is committed.
+class DummyReadCallback : public ReadCallback {
+ public:
+  DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
+  bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
+  void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
+};
+
+// Test param:
+//   bool: whether to pass read_callback to NewIterator().
+class DBIteratorTest : public DBTestBase,
+                       public testing::WithParamInterface<bool> {
+ public:
+  DBIteratorTest() : DBTestBase("/db_iterator_test") {}
+
+  Iterator* NewIterator(const ReadOptions& read_options,
+                        ColumnFamilyHandle* column_family = nullptr) {
+    if (column_family == nullptr) {
+      column_family = db_->DefaultColumnFamily();
+    }
+    auto* cfd =
+        reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+    SequenceNumber seq = read_options.snapshot != nullptr
+                             ? read_options.snapshot->GetSequenceNumber()
+                             : db_->GetLatestSequenceNumber();
+    bool use_read_callback = GetParam();
+    DummyReadCallback* read_callback = nullptr;
+    if (use_read_callback) {
+      read_callback = new DummyReadCallback();
+      read_callback->SetSnapshot(seq);
+      InstrumentedMutexLock lock(&mutex_);
+      read_callbacks_.push_back(
+          std::unique_ptr<DummyReadCallback>(read_callback));
+    }
+    return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
+  }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
+};
+
+TEST_P(DBIteratorTest, IteratorProperty) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Put(1, "1", "2");
+  Delete(1, "2");
+  ReadOptions ropt;
+  ropt.pin_data = false;
+  {
+    std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
+    iter->SeekToFirst();
+    std::string prop_value;
+    ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("0", prop_value);
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    iter->Next();
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("Iterator is not valid.", prop_value);
+
+    // Get internal key at which the iteration stopped (tombstone in this
+    // case).
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+    ASSERT_EQ("2", prop_value);
+  }
+  Close();
+}
+
+TEST_P(DBIteratorTest, PersistedTierOnIterator) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ReadOptions ropt;
+  ropt.read_tier = kPersistedTier;
+
+  auto* iter = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_TRUE(iter->status().IsNotSupported());
+  delete iter;
+
+  std::vector<Iterator*> iters;
+  ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+  Close();
+}
+
+TEST_P(DBIteratorTest, NonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    Options options = CurrentOptions();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
+
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // flush memtable to storage. Now, the key should not be in the
+    // memtable nor in the block cache.
+    ASSERT_OK(Flush(1));
+
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+    uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      count++;
+    }
+    ASSERT_EQ(count, 0);
+    ASSERT_TRUE(iter->status().IsIncomplete());
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // read in the specified block via a regular get
+    ASSERT_EQ(Get(1, "a"), "b");
+
+    // verify that we can find it via a non-blocking scan
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // This test verifies block cache behavior, which the plain table
+    // format does not use.
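+    // (kBlockCacheTier restricts reads to data already in memory; the
+    // Incomplete status checked above is how an iterator reports that it
+    // would have needed to touch storage.)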
+ } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads)); +} + +TEST_P(DBIteratorTest, IterSeekBeforePrev) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("2", "j")); + auto iter = NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + iter->Prev(); + iter->Seek(Slice("a")); + iter->Prev(); + delete iter; +} + +TEST_P(DBIteratorTest, IterReseekNewUpperBound) { + Random rnd(301); + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + table_options.block_size_deviation = 50; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.compression = kNoCompression; + Reopen(options); + + ASSERT_OK(Put("a", RandomString(&rnd, 400))); + ASSERT_OK(Put("aabb", RandomString(&rnd, 400))); + ASSERT_OK(Put("aaef", RandomString(&rnd, 400))); + ASSERT_OK(Put("b", RandomString(&rnd, 400))); + dbfull()->Flush(FlushOptions()); + ReadOptions opts; + Slice ub = Slice("aa"); + opts.iterate_upper_bound = &ub; + auto iter = NewIterator(opts); + iter->Seek(Slice("a")); + ub = Slice("b"); + iter->Seek(Slice("aabc")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aaef"); + delete iter; +} + +TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("2", "j")); + auto iter = NewIterator(ReadOptions()); + iter->SeekForPrev(Slice("0")); + iter->Next(); + iter->SeekForPrev(Slice("1")); + iter->Next(); + delete iter; +} + +namespace { +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} +} // namespace + +TEST_P(DBIteratorTest, IterLongKeys) { + ASSERT_OK(Put(MakeLongKey(20, 0), "0")); + ASSERT_OK(Put(MakeLongKey(32, 2), "2")); + ASSERT_OK(Put("a", "b")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put(MakeLongKey(50, 1), "1")); + ASSERT_OK(Put(MakeLongKey(127, 3), "3")); + ASSERT_OK(Put(MakeLongKey(64, 4), "4")); + auto iter = NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + iter->Seek(MakeLongKey(20, 0)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); + + iter->SeekForPrev(MakeLongKey(127, 3)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + delete iter; + + iter = NewIterator(ReadOptions()); + iter->Seek(MakeLongKey(50, 1)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + delete iter; +} + +TEST_P(DBIteratorTest, IterNextWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = NewIterator(ReadOptions()); + + // Create a key that 
needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("a")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->SeekForPrev(Slice("b")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + + delete iter; +} + +TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("d", "e")); + auto iter = NewIterator(ReadOptions()); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Seek(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Prev(); + iter->SeekForPrev(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Prev(); + delete iter; +} + +TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) { + ASSERT_OK(Put("0", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + ASSERT_OK(Put("e", "f")); + auto iter = NewIterator(ReadOptions()); + auto iter2 = NewIterator(ReadOptions()); + iter->Seek(Slice("c")); + iter2->SeekForPrev(Slice("d")); + ASSERT_EQ(IterStatus(iter), "c->d"); + ASSERT_EQ(IterStatus(iter2), "c->d"); + + // Create a key that needs to be skipped for Seq too new + for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; + i++) { + ASSERT_OK(Put("b", "f")); + } + + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Prev(); + iter2->Prev(); + ASSERT_EQ(IterStatus(iter2), "a->b"); + iter2->Prev(); + delete iter; + delete iter2; +} + +TEST_P(DBIteratorTest, IterEmpty) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekForPrev("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_P(DBIteratorTest, IterSingle) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + 
iter->SeekForPrev("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_P(DBIteratorTest, IterMulti) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", "vb")); + ASSERT_OK(Put(1, "c", "vc")); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->SeekForPrev("d"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + iter->Seek("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Seek("z"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + iter->Prev(); + iter->Prev(); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Switch from forward to reverse + iter->SeekToFirst(); + iter->Next(); + iter->Next(); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + + // Make sure iter stays at snapshot + ASSERT_OK(Put(1, "a", "va2")); + ASSERT_OK(Put(1, "a2", "va3")); + ASSERT_OK(Put(1, "b", "vb2")); + ASSERT_OK(Put(1, "c", "vc2")); + ASSERT_OK(Delete(1, "b")); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +// Check that we can skip over a run of user keys +// by using reseek rather than sequential scan +TEST_P(DBIteratorTest, IterReseek) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); + options.max_sequential_skip_in_iterations = 3; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // insert three keys with same 
userkey and verify that + // reseek is not invoked. For each of these test cases, + // verify that we can find the next key "b". + ASSERT_OK(Put(1, "a", "zero")); + ASSERT_OK(Put(1, "a", "one")); + ASSERT_OK(Put(1, "a", "two")); + ASSERT_OK(Put(1, "b", "bone")); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->two"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of three keys with same userkey and verify + // that reseek is still not invoked. + ASSERT_OK(Put(1, "a", "three")); + iter = NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->three"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // insert a total of four keys with same userkey and verify + // that reseek is invoked. + ASSERT_OK(Put(1, "a", "four")); + iter = NewIterator(ReadOptions(), handles_[1]); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->four"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "b->bone"); + delete iter; + + // Testing reverse iterator + // At this point, we have three versions of "a" and one version of "b". + // The reseek statistics is already at 1. + int num_reseeks = static_cast( + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + + // Insert another version of b and assert that reseek is not invoked + ASSERT_OK(Put(1, "b", "btwo")); + iter = NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->btwo"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; + + // insert two more versions of b. This makes a total of 4 versions + // of b and 4 versions of a. 
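+  // With max_sequential_skip_in_iterations set to 3 above, this fourth
+  // version of "b" is what pushes Prev() past the skip budget and forces
+  // the reseek counted by the ticker assertions below.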
+ ASSERT_OK(Put(1, "b", "bthree")); + ASSERT_OK(Put(1, "b", "bfour")); + iter = NewIterator(ReadOptions(), handles_[1]); + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "b->bfour"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); + iter->Prev(); + + // the previous Prev call should have invoked reseek + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); + ASSERT_EQ(IterStatus(iter), "a->four"); + delete iter; +} + +TEST_P(DBIteratorTest, IterSmallAndLargeMix) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "a", "va")); + ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); + ASSERT_OK(Put(1, "c", "vc")); + ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); + ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); + + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + delete iter; + } while (ChangeCompactOptions()); +} + +TEST_P(DBIteratorTest, IterMultiWithDelete) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "ka", "va")); + ASSERT_OK(Put(1, "kb", "vb")); + ASSERT_OK(Put(1, "kc", "vc")); + ASSERT_OK(Delete(1, "kb")); + ASSERT_EQ("NOT_FOUND", Get(1, "kb")); + + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); + iter->Seek("kc"); + ASSERT_EQ(IterStatus(iter), "kc->vc"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_ && + kHashSkipList != option_config_) { // doesn't support SeekToLast + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "ka->va"); + } + } + delete iter; + } while (ChangeOptions()); +} + +TEST_P(DBIteratorTest, IterPrevMaxSkip) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + for (int i = 0; i < 2; i++) { + ASSERT_OK(Put(1, "key1", "v1")); + ASSERT_OK(Put(1, "key2", "v2")); + ASSERT_OK(Put(1, "key3", "v3")); + ASSERT_OK(Put(1, "key4", "v4")); + ASSERT_OK(Put(1, "key5", "v5")); + } + + VerifyIterLast("key5->v5", 1); + + ASSERT_OK(Delete(1, "key5")); + VerifyIterLast("key4->v4", 1); + + ASSERT_OK(Delete(1, "key4")); + VerifyIterLast("key3->v3", 1); + + ASSERT_OK(Delete(1, "key3")); + VerifyIterLast("key2->v2", 1); + + ASSERT_OK(Delete(1, "key2")); + VerifyIterLast("key1->v1", 1); + + ASSERT_OK(Delete(1, "key1")); + VerifyIterLast("(invalid)", 1); + } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); +} + +TEST_P(DBIteratorTest, IterWithSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + 
CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + ASSERT_OK(Put(1, "key1", "val1")); + ASSERT_OK(Put(1, "key2", "val2")); + ASSERT_OK(Put(1, "key3", "val3")); + ASSERT_OK(Put(1, "key4", "val4")); + ASSERT_OK(Put(1, "key5", "val5")); + + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = NewIterator(options, handles_[1]); + + ASSERT_OK(Put(1, "key0", "val0")); + // Put more values after the snapshot + ASSERT_OK(Put(1, "key100", "val100")); + ASSERT_OK(Put(1, "key101", "val101")); + + iter->Seek("key5"); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + if (!CurrentOptions().merge_operator) { + // TODO: merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_ && kHashSkipList != option_config_) { + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key4->val4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key5->val5"); + } + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + } + + if (!CurrentOptions().merge_operator) { + // TODO(gzh): merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_ && kHashSkipList != option_config_) { + iter->SeekForPrev("key1"); + ASSERT_EQ(IterStatus(iter), "key1->val1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key2->val2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key2->val2"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key1->val1"); + iter->Prev(); + ASSERT_TRUE(!iter->Valid()); + } + } + db_->ReleaseSnapshot(snapshot); + delete iter; + } while (ChangeOptions()); +} + +TEST_P(DBIteratorTest, IteratorPinsRef) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + Put(1, "foo", "hello"); + + // Get iterator that will yield the current contents of the DB. 
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    // Write to force compactions
+    Put(1, "foo", "newvalue1");
+    for (int i = 0; i < 100; i++) {
+      // 100KB values
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+    }
+    Put(1, "foo", "newvalue2");
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ("hello", iter->value().ToString());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  Put(1, "foo", "delete-cf-then-delete-iter");
+  Put(1, "hello", "value2");
+
+  ColumnFamilyHandle* cf = handles_[1];
+  ReadOptions ro;
+
+  auto* iter = db_->NewIterator(ro, cf);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
+
+  // delete CF handle
+  db_->DestroyColumnFamilyHandle(cf);
+  handles_.erase(std::begin(handles_) + 1);
+
+  // delete Iterator after CF handle is deleted
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "hello->value2");
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  Put(1, "foo", "drop-cf-then-delete-iter");
+
+  ReadOptions ro;
+  ColumnFamilyHandle* cf = handles_[1];
+
+  auto* iter = db_->NewIterator(ro, cf);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
+
+  // drop and delete CF
+  db_->DropColumnFamily(cf);
+  db_->DestroyColumnFamilyHandle(cf);
+  handles_.erase(std::begin(handles_) + 1);
+
+  // delete Iterator after CF handle is dropped
+  delete iter;
+}
+
+// SetOptions not defined in ROCKSDB LITE
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, DBIteratorBoundTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing basic case with no iterate_upper_bound and no prefix_extractor
+  {
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+    iter->SeekForPrev("g1");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+  }
+
+  // testing iterate_upper_bound and forward iterator
+  // to make sure it stops at bound
+  {
+    ReadOptions ro;
+    // iterate_upper_bound points beyond the last expected entry
+    Slice prefix("foo2");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("foo1")), 0);
+
+    iter->Next();
+    // should stop here...
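+    // ...because iterate_upper_bound is exclusive: the bound key itself is
+    // never returned.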
+    ASSERT_TRUE(!iter->Valid());
+  }
+  // Testing SeekToLast with iterate_upper_bound set
+  {
+    ReadOptions ro;
+
+    Slice prefix("foo");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+  }
+
+  // prefix is the first letter of the key
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing with iterate_upper_bound and prefix_extractor
+  // Seek target and iterate_upper_bound are not in the same prefix
+  // This should be an error
+  {
+    ReadOptions ro;
+    Slice upper_bound("g");
+    ro.iterate_upper_bound = &upper_bound;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo1", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // testing that iterate_upper_bound prevents iterating over deleted items
+  // once the bound has already been reached
+  {
+    options.prefix_extractor = nullptr;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("a", "0"));
+    ASSERT_OK(Put("b", "0"));
+    ASSERT_OK(Put("b1", "0"));
+    ASSERT_OK(Put("c", "0"));
+    ASSERT_OK(Put("d", "0"));
+    ASSERT_OK(Put("e", "0"));
+    ASSERT_OK(Delete("c"));
+    ASSERT_OK(Delete("d"));
+
+    // base case with no bound
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    get_perf_context()->Reset();
+    iter->Next();
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count),
+        2);
+
+    // now testing with iterate_upper_bound
+    Slice prefix("c");
+    ro.iterate_upper_bound = &prefix;
+
+    iter.reset(NewIterator(ro));
+
+    get_perf_context()->Reset();
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    iter->Next();
+    // the iteration should stop as soon as the bound key is reached
+    // even though the key is deleted
+    // hence internal_delete_skipped_count should be 0
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count),
+        0);
+  }
+}
+
+TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("z", "0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Put("foo3", "bar3"));
+  ASSERT_OK(Put("foo4", "bar4"));
+
+  {
+    std::string up_str = "foo5";
+    Slice up(up_str);
+    ReadOptions ro;
+    ro.iterate_upper_bound = &up;
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    uint64_t prev_block_cache_hit =
+        TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    uint64_t prev_block_cache_miss =
+        TestGetTickerCount(options, BLOCK_CACHE_MISS);
+
+    ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
+
+    iter->Seek("foo4");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
+    ASSERT_EQ(prev_block_cache_hit,
+              TestGetTickerCount(options, BLOCK_CACHE_HIT));
+    ASSERT_EQ(prev_block_cache_miss,
+              TestGetTickerCount(options, BLOCK_CACHE_MISS));
+
+    iter->Seek("foo2");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
+    ASSERT_EQ(prev_block_cache_hit,
+              TestGetTickerCount(options, BLOCK_CACHE_HIT));
+    ASSERT_EQ(prev_block_cache_miss,
+              TestGetTickerCount(options, BLOCK_CACHE_MISS));
+  }
+}
+#endif
+
+TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
+  for (auto format_version : {2, 3, 4}) {
+    int upper_bound_hits = 0;
+    Options options = CurrentOptions();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableIterator:out_of_bound",
+        [&upper_bound_hits](void*) { upper_bound_hits++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.prefix_extractor = nullptr;
+    BlockBasedTableOptions table_options;
+    table_options.format_version = format_version;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo1", "bar1"));
+    ASSERT_OK(Put("foo2", "bar2"));
+    ASSERT_OK(Put("foo4", "bar4"));
+    ASSERT_OK(Flush());
+
+    Slice ub("foo3");
+    ReadOptions ro;
+    ro.iterate_upper_bound = &ub;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+    ASSERT_EQ(upper_bound_hits, 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+    ASSERT_EQ(upper_bound_hits, 0);
+
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_EQ(upper_bound_hits, 1);
+  }
+}
+
+// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
+// they don't do unnecessary block reads.
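+// (kBinarySearchWithFirstKey stores each data block's first key in the
+// index, letting some seeks be answered from the index alone; the
+// BLOCK_CACHE_DATA_* counters below track when a data block actually has
+// to be read.)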
+TEST_P(DBIteratorTest, IndexWithFirstKey) {
+  for (int tailing = 0; tailing < 2; ++tailing) {
+    SCOPED_TRACE("tailing = " + std::to_string(tailing));
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.prefix_extractor = nullptr;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    Statistics* stats = options.statistics.get();
+    BlockBasedTableOptions table_options;
+    table_options.index_type =
+        BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+    table_options.index_shortening =
+        BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    table_options.block_cache =
+        NewLRUCache(8000);  // fits all blocks and their cache metadata overhead
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Merge("a1", "x1"));
+    ASSERT_OK(Merge("b1", "y1"));
+    ASSERT_OK(Merge("c0", "z1"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Merge("a2", "x2"));
+    ASSERT_OK(Merge("b2", "y2"));
+    ASSERT_OK(Merge("c0", "z2"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Merge("a3", "x3"));
+    ASSERT_OK(Merge("b3", "y3"));
+    ASSERT_OK(Merge("c3", "z3"));
+    ASSERT_OK(Flush());
+
+    // Block cache is not important for this test.
+    // We use BLOCK_CACHE_DATA_* counters just because they're the most readily
+    // available way of counting block accesses.
+
+    ReadOptions ropt;
+    ropt.tailing = tailing;
+    std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+    iter->Seek("b10");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b2", iter->key().ToString());
+    EXPECT_EQ("y2", iter->value().ToString());
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b3", iter->key().ToString());
+    EXPECT_EQ("y3", iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    iter->Seek("c0");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("c0", iter->key().ToString());
+    EXPECT_EQ("z1,z2", iter->value().ToString());
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("c3", iter->key().ToString());
+    EXPECT_EQ("z3", iter->value().ToString());
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter.reset();
+
+    // Enable iterate_upper_bound and check that iterator is not trying to read
+    // blocks that are fully above upper bound.
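+    // With the first-key index, the index entry alone shows that the block
+    // after "b2" begins at or past the bound, so the data-miss counter is
+    // expected to stay at 5 below.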
+ std::string ub = "b3"; + Slice ub_slice(ub); + ropt.iterate_upper_bound = &ub_slice; + iter.reset(NewIterator(ropt)); + + iter->Seek("b2"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + } +} + +TEST_P(DBIteratorTest, IndexWithFirstKeyGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a", "x1")); + ASSERT_OK(Merge("c", "y1")); + ASSERT_OK(Merge("e", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("c", "y2")); + ASSERT_OK(Merge("e", "z2")); + ASSERT_OK(Flush()); + + // Get() between blocks shouldn't read any blocks. + ASSERT_EQ("NOT_FOUND", Get("b")); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Get() of an existing key shouldn't read any unnecessary blocks when there's + // only one key per block. + + ASSERT_EQ("y1,y2", Get("c")); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ("x1", Get("a")); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(std::vector({"NOT_FOUND", "z1,z2"}), + MultiGet({"b", "e"})); +} + +// TODO(3.13): fix the issue of Seek() + Prev() which might not necessary +// return the biggest key which is smaller than the seek key. 
+TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.env = env_;
+  DestroyAndReopen(options);
+
+  // write three entries with different keys using Merge()
+  WriteOptions wopts;
+  db_->Merge(wopts, "1", "data1");
+  db_->Merge(wopts, "2", "data2");
+  db_->Merge(wopts, "3", "data3");
+
+  std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
+
+  it->Seek("2");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("2", it->key().ToString());
+
+  it->Prev();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("1", it->key().ToString());
+
+  it->SeekForPrev("1");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("1", it->key().ToString());
+
+  it->Next();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("2", it->key().ToString());
+}
+
+class DBIteratorTestForPinnedData : public DBIteratorTest {
+ public:
+  enum TestConfig {
+    NORMAL,
+    CLOSE_AND_OPEN,
+    COMPACT_BEFORE_READ,
+    FLUSH_EVERY_1000,
+    MAX
+  };
+  DBIteratorTestForPinnedData() : DBIteratorTest() {}
+  void PinnedDataIteratorRandomized(TestConfig run_config) {
+    // Generate Random data
+    Random rnd(301);
+
+    int puts = 100000;
+    int key_pool = static_cast<int>(puts * 0.7);
+    int key_size = 100;
+    int val_size = 1000;
+    int seeks_percentage = 20;   // 20% of keys will be used to test seek()
+    int delete_percentage = 20;  // 20% of keys will be deleted
+    int merge_percentage = 20;   // 20% of keys will be added using Merge()
+
+    Options options = CurrentOptions();
+    BlockBasedTableOptions table_options;
+    table_options.use_delta_encoding = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.merge_operator = MergeOperators::CreatePutOperator();
+    DestroyAndReopen(options);
+
+    std::vector<std::string> generated_keys(key_pool);
+    for (int i = 0; i < key_pool; i++) {
+      generated_keys[i] = RandomString(&rnd, key_size);
+    }
+
+    std::map<std::string, std::string> true_data;
+    std::vector<std::string> random_keys;
+    std::vector<std::string> deleted_keys;
+    for (int i = 0; i < puts; i++) {
+      auto& k = generated_keys[rnd.Next() % key_pool];
+      auto v = RandomString(&rnd, val_size);
+
+      // Insert data to true_data map and to DB
+      true_data[k] = v;
+      if (rnd.PercentTrue(merge_percentage)) {
+        ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+      } else {
+        ASSERT_OK(Put(k, v));
+      }
+
+      // Pick random keys to be used to test Seek()
+      if (rnd.PercentTrue(seeks_percentage)) {
+        random_keys.push_back(k);
+      }
+
+      // Delete some random keys
+      if (rnd.PercentTrue(delete_percentage)) {
+        deleted_keys.push_back(k);
+        true_data.erase(k);
+        ASSERT_OK(Delete(k));
+      }
+
+      if (run_config == TestConfig::FLUSH_EVERY_1000) {
+        if (i && i % 1000 == 0) {
+          Flush();
+        }
+      }
+    }
+
+    if (run_config == TestConfig::CLOSE_AND_OPEN) {
+      Close();
+      Reopen(options);
+    } else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
+      db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    }
+
+    ReadOptions ro;
+    ro.pin_data = true;
+    auto iter = NewIterator(ro);
+
+    {
+      // Test Seek to random keys
+      std::vector<Slice> keys_slices;
+      std::vector<std::string> true_keys;
+      for (auto& k : random_keys) {
+        iter->Seek(k);
+        if (!iter->Valid()) {
+          ASSERT_EQ(true_data.lower_bound(k), true_data.end());
+          continue;
+        }
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        keys_slices.push_back(iter->key());
+        true_keys.push_back(true_data.lower_bound(k)->first);
+      }
+
+      for (size_t i = 0; i < keys_slices.size(); i++) {
+        ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+      }
+    }
+
+    {
+      // Test SeekForPrev to random keys
+      std::vector<Slice> keys_slices;
+      std::vector<std::string> true_keys;
+      for (auto& k : random_keys) {
+        iter->SeekForPrev(k);
+        if (!iter->Valid()) {
+          ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
+          continue;
+        }
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        keys_slices.push_back(iter->key());
+        true_keys.push_back((--true_data.upper_bound(k))->first);
+      }
+
+      for (size_t i = 0; i < keys_slices.size(); i++) {
+        ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+      }
+    }
+
+    {
+      // Test iterating all data forward
+      std::vector<Slice> all_keys;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        all_keys.push_back(iter->key());
+      }
+      ASSERT_EQ(all_keys.size(), true_data.size());
+
+      // Verify that all keys slices are valid
+      auto data_iter = true_data.begin();
+      for (size_t i = 0; i < all_keys.size(); i++) {
+        ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+        data_iter++;
+      }
+    }
+
+    {
+      // Test iterating all data backward
+      std::vector<Slice> all_keys;
+      for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        all_keys.push_back(iter->key());
+      }
+      ASSERT_EQ(all_keys.size(), true_data.size());
+
+      // Verify that all keys slices are valid (backward)
+      auto data_iter = true_data.rbegin();
+      for (size_t i = 0; i < all_keys.size(); i++) {
+        ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+        data_iter++;
+      }
+    }
+
+    delete iter;
+  }
+};
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
+  PinnedDataIteratorRandomized(TestConfig::NORMAL);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCloseAndOpen) {
+  PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
+}
+
+TEST_P(DBIteratorTestForPinnedData,
+       PinnedDataIteratorRandomizedCompactBeforeRead) {
+  PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
+  PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 1024 * 1024 * 10;  // 10 Mb
+  DestroyAndReopen(options);
+
+  std::map<std::string, std::string> true_data;
+
+  // Generate 4 sst files in L2
+  Random rnd(301);
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i * 3);
+    std::string v = RandomString(&rnd, 100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+    if (i % 250 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4");
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(FilesPerLevel(0), "0,4");
+
+  // Generate 4 sst files in L0
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i * 2);
+    std::string v = RandomString(&rnd, 100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+    if (i % 250 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+  // Add some keys/values in memtables
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i);
+    std::string v = RandomString(&rnd, 100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  ASSERT_EQ(results.size(), true_data.size());
+  auto data_iter = true_data.begin();
+  for (size_t i = 0; i < results.size(); i++, data_iter++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, data_iter->first);
+    ASSERT_EQ(kv.second, data_iter->second);
+  }
+
+  delete iter;
+}
+#endif
+
+TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  DestroyAndReopen(options);
+
+  std::string numbers[7];
+  for (int val = 0; val <= 6; val++) {
+    PutFixed64(numbers + val, val);
+  }
+
+  // +1 all keys in range [ 0 => 999]
+  for (int i = 0; i < 1000; i++) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
+  }
+
+  // +2 all keys divisible by 2 in range [ 0 => 999]
+  for (int i = 0; i < 1000; i += 2) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
+  }
+
+  // +3 all keys divisible by 5 in range [ 0 => 999]
+  for (int i = 0; i < 1000; i += 5) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
+  }
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  ASSERT_EQ(results.size(), 1000);
+  for (size_t i = 0; i < results.size(); i++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
+    int expected_val = 1;
+    if (i % 2 == 0) {
+      expected_val += 2;
+    }
+    if (i % 5 == 0) {
+      expected_val += 3;
+    }
+    ASSERT_EQ(kv.second, numbers[expected_val]);
+  }
+
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.write_buffer_size = 100000;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  std::map<std::string, std::string> true_data;
+  for (int i = 0; i < 1000; i++) {
+    std::string k = RandomString(&rnd, 10);
+    std::string v = RandomString(&rnd, 1000);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  // Delete 50% of the keys and update the other 50%
+  for (auto& kv : true_data) {
+    if (rnd.OneIn(2)) {
+      ASSERT_OK(Delete(kv.first));
+    } else {
+      std::string new_val = RandomString(&rnd, 1000);
+      ASSERT_OK(Put(kv.first, new_val));
+    }
+  }
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  auto data_iter = true_data.begin();
+  for (size_t i = 0; i < results.size(); i++, data_iter++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, data_iter->first);
+    ASSERT_EQ(kv.second, data_iter->second);
+  }
+
+  delete iter;
+}
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+  const char* Name() const override {
+    return "SliceTransformLimitedDomainGeneric";
+  }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 1);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 1;
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 1;
+  }
+};
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.disable_auto_compactions = true;
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a1", "va1"));
+  ASSERT_OK(Put("a2", "va2"));
+  ASSERT_OK(Put("a3", "va3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b1", "vb1"));
+  ASSERT_OK(Put("b2", "vb2"));
+  ASSERT_OK(Put("b3", "vb3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b4", "vb4"));
+  ASSERT_OK(Put("d1", "vd1"));
+  ASSERT_OK(Put("d2", "vd2"));
+  ASSERT_OK(Put("d4", "vd4"));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(1);
+  {
+    ReadOptions ro;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekForPrev("a4");
+    ASSERT_EQ(iter->key().ToString(), "a3");
+    ASSERT_EQ(iter->value().ToString(), "va3");
+
+    iter->SeekForPrev("c2");
+    ASSERT_EQ(iter->key().ToString(), "b3");
+    iter->SeekForPrev("d3");
+    ASSERT_EQ(iter->key().ToString(), "d2");
+    iter->SeekForPrev("b5");
+    ASSERT_EQ(iter->key().ToString(), "b4");
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.prefix_same_as_start = true;
+    Iterator* iter = NewIterator(ro);
+    iter->SeekForPrev("c2");
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
+  Options options = CurrentOptions();
+  options.prefix_extractor =
+      std::make_shared<SliceTransformLimitedDomainGeneric>();
+  options.disable_auto_compactions = true;
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a1", "va1"));
+  ASSERT_OK(Put("a2", "va2"));
+  ASSERT_OK(Put("a3", "va3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b1", "vb1"));
+  ASSERT_OK(Put("b2", "vb2"));
+  ASSERT_OK(Put("b3", "vb3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b4", "vb4"));
+  ASSERT_OK(Put("d1", "vd1"));
+  ASSERT_OK(Put("d2", "vd2"));
+  ASSERT_OK(Put("d4", "vd4"));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(1);
+  {
+    ReadOptions ro;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekForPrev("a4");
+    ASSERT_EQ(iter->key().ToString(), "a3");
+    ASSERT_EQ(iter->value().ToString(), "va3");
+
+    iter->SeekForPrev("c2");
+    ASSERT_EQ(iter->key().ToString(), "b3");
+    iter->SeekForPrev("d3");
+    ASSERT_EQ(iter->key().ToString(), "d2");
+    iter->SeekForPrev("b5");
+    ASSERT_EQ(iter->key().ToString(), "b4");
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.prefix_same_as_start = true;
+    Iterator* iter = NewIterator(ro);
+    iter->SeekForPrev("c2");
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1;  // every block will contain one entry
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+  options.disable_auto_compactions = true;
+  options.max_sequential_skip_in_iterations = 8;
+
+  DestroyAndReopen(options);
+
+  // Putting such deletes will force DBIter::Prev() to fall back to a Seek
+  for (int file_num = 0; file_num < 10; file_num++) {
+    ASSERT_OK(Delete("key4"));
+    ASSERT_OK(Flush());
+  }
+
+  // First File containing 5 blocks of puts
+  ASSERT_OK(Put("key1", "val1.0"));
+  ASSERT_OK(Put("key2", "val2.0"));
+  ASSERT_OK(Put("key3", "val3.0"));
+  ASSERT_OK(Put("key4", "val4.0"));
+  ASSERT_OK(Put("key5", "val5.0"));
+  ASSERT_OK(Flush());
+
+  // Second file containing 9 blocks of merge operands
+  ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekToLast();
+    ASSERT_EQ(iter->key().ToString(), "key5");
+    ASSERT_EQ(iter->value().ToString(), "val5.0");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key4");
+    ASSERT_EQ(iter->value().ToString(), "val4.0");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key3");
+    ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key2");
+    ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key1");
+    ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
+
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+  options.disable_auto_compactions = true;
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.max_sequential_skip_in_iterations = 8;
+  DestroyAndReopen(options);
+
+  const int kNumKeys = 500;
+  // Small number of merge operands to make sure that DBIter::Prev() doesn't
+  // fall back to Seek()
+  const int kNumMergeOperands = 3;
+  // Use value size that will make sure that every block contains 1 key
+  const int kValSize =
+      static_cast<int>(BlockBasedTableOptions().block_size) * 4;
+  // Percentage of keys that won't get merge operations
+  const int kNoMergeOpPercentage = 20;
+  // Percentage of keys that will be deleted
+  const int kDeletePercentage = 10;
+
+  // For half of the key range we will write multiple deletes first to
+  // force DBIter::Prev() to fall back to Seek()
+  for (int file_num = 0; file_num < 10; file_num++) {
+    for (int i = 0; i < kNumKeys; i += 2) {
+      ASSERT_OK(Delete(Key(i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  Random rnd(301);
+  std::map<std::string, std::string> true_data;
+  std::string gen_key;
+  std::string gen_val;
+
+  for (int i = 0; i < kNumKeys; i++) {
+    gen_key = Key(i);
+    gen_val = RandomString(&rnd, kValSize);
+
+    ASSERT_OK(Put(gen_key, gen_val));
+    true_data[gen_key] = gen_val;
+  }
+  ASSERT_OK(Flush());
+
+  // Separate values and merge operands in different files so that we
+  // make sure that we don't merge them while flushing but actually
+  // merge them in the read path
+  for (int i = 0; i < kNumKeys; i++) {
+    if (rnd.PercentTrue(kNoMergeOpPercentage)) {
+      // Don't give merge operations for some keys
+      continue;
+    }
+
+    for (int j = 0; j < kNumMergeOperands; j++) {
+      gen_key = Key(i);
+      gen_val = RandomString(&rnd, kValSize);
+
+      ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
+      true_data[gen_key] += "," + gen_val;
+    }
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < kNumKeys; i++) {
+    if (rnd.PercentTrue(kDeletePercentage)) {
+      gen_key = Key(i);
+
+      ASSERT_OK(Delete(gen_key));
+      true_data.erase(gen_key);
+    }
+  }
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+    auto data_iter = true_data.rbegin();
+
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      data_iter++;
+    }
+    ASSERT_EQ(data_iter, true_data.rend());
+
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+    auto data_iter = true_data.rbegin();
+
+    int entries_right = 0;
+    std::string seek_key;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      // Verify key/value of current position
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+
+      bool restore_position_with_seek = rnd.Uniform(2);
+      if (restore_position_with_seek) {
+        seek_key = iter->key().ToString();
+      }
+
+      // Do some Next() operations to restore the iterator to its original
+      // position
+      int next_count =
+          entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0;
+      for (int i = 0; i < next_count; i++) {
+        iter->Next();
+        data_iter--;
+
+        ASSERT_EQ(iter->key().ToString(), data_iter->first);
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      }
+
+      if (restore_position_with_seek) {
+        // Restore the original position using Seek()
+        iter->Seek(seek_key);
+        for (int i = 0; i < next_count; i++) {
+          data_iter++;
+        }
+
+        ASSERT_EQ(iter->key().ToString(), data_iter->first);
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      } else {
+        // Restore the original position using Prev()
+        for (int i = 0; i < next_count; i++) {
+          iter->Prev();
+          data_iter++;
+
+          ASSERT_EQ(iter->key().ToString(), data_iter->first);
+          ASSERT_EQ(iter->value().ToString(), data_iter->second);
+        }
+      }
+
+      entries_right++;
+      data_iter++;
+    }
+    ASSERT_EQ(data_iter, true_data.rend());
+
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 1000; i++) {
+    // Key 10 bytes / Value 10 bytes
+    ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+  }
+
+  std::atomic<uint64_t> total_next(0);
+  std::atomic<uint64_t> total_next_found(0);
+  std::atomic<uint64_t> total_prev(0);
+  std::atomic<uint64_t> total_prev_found(0);
+  std::atomic<uint64_t> total_bytes(0);
+
+  std::vector<port::Thread> threads;
+  std::function<void()> reader_func_next = [&]() {
+    SetPerfLevel(kEnableCount);
+    get_perf_context()->Reset();
+    Iterator* iter = NewIterator(ReadOptions());
+
+    iter->SeekToFirst();
+    // Seek will bump ITER_BYTES_READ
+    uint64_t bytes = 0;
+    bytes += iter->key().size();
+    bytes += iter->value().size();
+    while (true) {
+      iter->Next();
+      total_next++;
+
+      if (!iter->Valid()) {
+        break;
+      }
+      total_next_found++;
+      bytes += iter->key().size();
+      bytes += iter->value().size();
+    }
+
+    delete iter;
+    ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+    SetPerfLevel(kDisable);
+    total_bytes += bytes;
+  };
+
+  std::function<void()> reader_func_prev = [&]() {
+    SetPerfLevel(kEnableCount);
+    Iterator* iter = NewIterator(ReadOptions());
+
+    iter->SeekToLast();
+    // Seek will bump ITER_BYTES_READ
+    uint64_t bytes = 0;
+    bytes += iter->key().size();
+    bytes += iter->value().size();
+    while (true) {
+      iter->Prev();
+      total_prev++;
+
+      if (!iter->Valid()) {
+        break;
+      }
+      total_prev_found++;
+      bytes += iter->key().size();
+      bytes += iter->value().size();
+    }
+
+    delete iter;
+    ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+    SetPerfLevel(kDisable);
+    total_bytes += bytes;
+  };
+
+  for (int i = 0; i < 10; i++) {
+    threads.emplace_back(reader_func_next);
+  }
+  for (int i = 0; i < 15; i++) {
+    threads.emplace_back(reader_func_prev);
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
+            (uint64_t)total_next_found);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
+            (uint64_t)total_prev_found);
+  ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ),
+            (uint64_t)total_bytes);
+}
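+
+// Editor's note (hedged): the ReadAhead test below exercises
+// ReadOptions::readahead_size, which makes the iterator issue larger
+// sequential file reads. A minimal usage sketch, assuming only the public
+// API:
+//
+//   ReadOptions ro;
+//   ro.readahead_size = 2 << 20;  // e.g. 2MB of readahead for long scans
+//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) { /* scan */ }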
+
+TEST_P(DBIteratorTest, ReadAhead) {
+  Options options;
+  env_->count_random_reads_ = true;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 4 << 20;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  std::string value(1024, 'a');
+  for (int i = 0; i < 100; i++) {
+    Put(Key(i), value);
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 100; i++) {
+    Put(Key(i), value);
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < 100; i++) {
+    Put(Key(i), value);
+  }
+  ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("1,1,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  env_->random_read_bytes_counter_ = 0;
+  options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+  ReadOptions read_options;
+  auto* iter = NewIterator(read_options);
+  iter->SeekToFirst();
+  int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
+  size_t bytes_read = env_->random_read_bytes_counter_;
+  delete iter;
+
+  int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
+  env_->random_read_bytes_counter_ = 0;
+  options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+  read_options.readahead_size = 1024 * 10;
+  iter = NewIterator(read_options);
+  iter->SeekToFirst();
+  int64_t num_file_opens_readahead =
+      TestGetTickerCount(options, NO_FILE_OPENS);
+  size_t bytes_read_readahead = env_->random_read_bytes_counter_;
+  delete iter;
+  int64_t num_file_closes_readahead =
+      TestGetTickerCount(options, NO_FILE_CLOSES);
+  ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+  ASSERT_EQ(num_file_closes, num_file_closes_readahead);
+  ASSERT_GT(bytes_read_readahead, bytes_read);
+  ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
+
+  // Verify correctness.
+  iter = NewIterator(read_options);
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_EQ(value, iter->value());
+    count++;
+  }
+  ASSERT_EQ(100, count);
+  for (int i = 0; i < 100; i++) {
+    iter->Seek(Key(i));
+    ASSERT_EQ(value, iter->value());
+  }
+  delete iter;
+}
+
+// Insert a key, create a snapshot iterator, overwrite the key lots of times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 3;
+  options.prefix_extractor = nullptr;
+  options.write_buffer_size = 1 << 27;  // big enough to avoid flush
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  // Insert.
+  ASSERT_OK(Put("b", "0"));
+
+  // Create iterator.
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+  // Insert a lot.
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+  }
+
+#ifndef ROCKSDB_LITE
+  // Check that memtable wasn't flushed.
+  std::string val;
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+  EXPECT_EQ("0", val);
+#endif
+
+  // Seek iterator to a smaller key.
+  get_perf_context()->Reset();
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ("b", iter->key().ToString());
+  EXPECT_EQ("0", iter->value().ToString());
+
+  // Check that the seek didn't do too much work.
+  // Checks are not tight, just make sure that everything is well below 100.
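+  // Editor's note (hedged): these perf-context counters track internal-key
+  // skips inside DBIter; with max_sequential_skip_in_iterations == 3 the
+  // iterator should reseek after a few hidden versions instead of walking
+  // all ~100 overwrites, keeping every count far below 100.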
+  EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
+  EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
+  EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
+  EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
+  EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
+
+  // Check that the iterator did something like what we expect.
+  EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+  EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
+  EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
+  EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
+  EXPECT_EQ(1, options.statistics->getTickerCount(
+                   NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
+TEST_P(DBIteratorTest, Refresh) {
+  ASSERT_OK(Put("x", "y"));
+
+  std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(Put("c", "d"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  iter->Refresh();
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  dbfull()->Flush(FlushOptions());
+
+  ASSERT_OK(Put("m", "n"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  iter->Refresh();
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("m")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  iter.reset();
+}
+
+TEST_P(DBIteratorTest, RefreshWithSnapshot) {
+  ASSERT_OK(Put("x", "y"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ReadOptions options;
+  options.snapshot = snapshot;
+  Iterator* iter = NewIterator(options);
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(Put("c", "d"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  Status s = iter->Refresh();
+  ASSERT_TRUE(s.IsNotSupported());
+  db_->ReleaseSnapshot(snapshot);
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, CreationFailure) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
+        *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Iterator* iter = NewIterator(ReadOptions());
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsCorruption());
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
+  Options options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  DestroyAndReopen(options);
+
+  // write a bunch of kvs to the database.
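+  // Editor's note (hedged): the keys are spread across an SST file and the
+  // memtable so that reversing direction under iterate_upper_bound has
+  // hidden newer versions to step over.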
+ ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("y", "1")); + ASSERT_OK(Put("y1", "1")); + ASSERT_OK(Put("y2", "1")); + ASSERT_OK(Put("y3", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Put("bar", "1")); + ASSERT_OK(Put("foo", "1")); + + std::string upper_bound = "x"; + Slice ub_slice(upper_bound); + ReadOptions ro; + ro.iterate_upper_bound = &ub_slice; + ro.max_skippable_internal_keys = 1000; + + Iterator* iter = NewIterator(ro); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bar", iter->key().ToString()); + + delete iter; +} + +TEST_P(DBIteratorTest, TableFilter) { + ASSERT_OK(Put("a", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("b", "2")); + ASSERT_OK(Put("c", "3")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("d", "4")); + ASSERT_OK(Put("e", "5")); + ASSERT_OK(Put("f", "6")); + dbfull()->Flush(FlushOptions()); + + // Ensure the table_filter callback is called once for each table. + { + std::set unseen{1, 2, 3}; + ReadOptions opts; + opts.table_filter = [&](const TableProperties& props) { + auto it = unseen.find(props.num_entries); + if (it == unseen.end()) { + ADD_FAILURE() << "saw table properties with an unexpected " + << props.num_entries << " entries"; + } else { + unseen.erase(it); + } + return true; + }; + auto iter = NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(unseen.empty()); + delete iter; + } + + // Ensure returning false in the table_filter hides the keys from that table + // during iteration. + { + ReadOptions opts; + opts.table_filter = [](const TableProperties& props) { + return props.num_entries != 2; + }; + auto iter = NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + delete iter; + } +} + +TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) { + Options options = CurrentOptions(); + options.max_sequential_skip_in_iterations = 3; + DestroyAndReopen(options); + + // write a bunch of kvs to the database. 
+ ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("y", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Put("bar", "1")); + ASSERT_OK(Put("foo", "1")); + ASSERT_OK(Put("foo", "2")); + + ASSERT_OK(Put("foo", "3")); + ASSERT_OK(Put("foo", "4")); + ASSERT_OK(Put("foo", "5")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put("foo", "6")); + + std::string upper_bound = "x"; + Slice ub_slice(upper_bound); + ReadOptions ro; + ro.snapshot = snapshot; + ro.iterate_upper_bound = &ub_slice; + + Iterator* iter = NewIterator(ro); + iter->SeekForPrev("goo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + iter->Prev(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +TEST_P(DBIteratorTest, SkipStatistics) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + + int skip_count = 0; + + // write a bunch of kvs to the database. + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("b", "1")); + ASSERT_OK(Put("c", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("d", "1")); + ASSERT_OK(Put("e", "1")); + ASSERT_OK(Put("f", "1")); + ASSERT_OK(Put("a", "2")); + ASSERT_OK(Put("b", "2")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("d")); + ASSERT_OK(Delete("e")); + ASSERT_OK(Delete("f")); + + Iterator* iter = NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 3); + delete iter; + skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + iter = NewIterator(ReadOptions()); + count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 3); + delete iter; + skip_count += 8; // Same as above, but in reverse order + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + ASSERT_OK(Put("aa", "1")); + ASSERT_OK(Put("ab", "1")); + ASSERT_OK(Put("ac", "1")); + ASSERT_OK(Put("ad", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("ab")); + ASSERT_OK(Delete("ac")); + ASSERT_OK(Delete("ad")); + + ReadOptions ro; + Slice prefix("b"); + ro.iterate_upper_bound = &prefix; + + iter = NewIterator(ro); + count = 0; + for(iter->Seek("aa"); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + skip_count += 6; // 3 deletes + 3 original keys + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + iter = NewIterator(ro); + count = 0; + for(iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 2); + delete iter; + // 3 deletes + 3 original keys + lower sequence of "a" + skip_count += 7; + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); +} + +TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ReadOptions ropts; + ropts.max_skippable_internal_keys = 2; + + Put("1", "val_1"); + // Add more tombstones than max_skippable_internal_keys so that Next() fails. 
+ Delete("2"); + Delete("3"); + Delete("4"); + Delete("5"); + Put("6", "val_6"); + + std::unique_ptr iter(NewIterator(ropts)); + iter->SeekToFirst(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "1"); + ASSERT_EQ(iter->value().ToString(), "val_1"); + + // This should fail as incomplete due to too many non-visible internal keys on + // the way to the next valid user key. + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); + + // Get the internal key at which Next() failed. + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value)); + ASSERT_EQ("4", prop_value); + + // Create a new iterator to seek to the internal key. + std::unique_ptr iter2(NewIterator(ropts)); + iter2->Seek(prop_value); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter2->status()); + + ASSERT_EQ(iter2->key().ToString(), "6"); + ASSERT_EQ(iter2->value().ToString(), "val_6"); +} + +// Reproduces a former bug where iterator would skip some records when DBIter +// re-seeks subiterator with Incomplete status. +TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + // Make sure the sst file has more than one block. + table_options.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Two records in sst file, each in its own block. + Put("b", ""); + Put("d", ""); + Flush(); + + // Create a nonblocking iterator before writing to memtable. + ReadOptions ropt; + ropt.read_tier = kBlockCacheTier; + std::unique_ptr iter(NewIterator(ropt)); + + // Overwrite a key in memtable many times to hit + // max_sequential_skip_in_iterations (which is 8 by default). + for (int i = 0; i < 20; ++i) { + Put("c", ""); + } + + // Load the second block in sst file into the block cache. + { + std::unique_ptr iter2(NewIterator(ReadOptions())); + iter2->Seek("d"); + } + + // Finally seek the nonblocking iterator. + iter->Seek("a"); + // With the bug, the status used to be OK, and the iterator used to point to + // "d". 
+  EXPECT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
+  Put("a", "");
+  Put("b", "");
+  Flush();
+
+  ReadOptions ropt;
+  Slice ub = "b";
+  ropt.iterate_upper_bound = &ub;
+
+  std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
+  it->SeekForPrev("a");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_OK(it->status());
+  ASSERT_EQ("a", it->key().ToString());
+  it->Next();
+  ASSERT_FALSE(it->Valid());
+  ASSERT_OK(it->status());
+  it->SeekForPrev("a");
+  ASSERT_OK(it->status());
+
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("a", it->key().ToString());
+}
+
+TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
+  Options options = CurrentOptions();
+  options.compression = CompressionType::kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 800;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  std::string random_str = RandomString(&rnd, 180);
+
+  ASSERT_OK(Put("1", random_str));
+  ASSERT_OK(Put("2", random_str));
+  ASSERT_OK(Put("3", random_str));
+  ASSERT_OK(Put("4", random_str));
+  // A new block
+  ASSERT_OK(Put("5", random_str));
+  ASSERT_OK(Put("6", random_str));
+  ASSERT_OK(Put("7", random_str));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("8", random_str));
+  ASSERT_OK(Put("9", random_str));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  int num_find_file_in_level = 0;
+  int num_idx_blk_seek = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelIterator::Seek:BeforeFindFile",
+      [&](void* /*arg*/) { num_find_file_in_level++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+    iter->Seek("1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("2");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("3");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(2, num_idx_blk_seek);
+
+    iter->Seek("6");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(2, num_idx_blk_seek);
+
+    iter->Seek("7");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(3, num_idx_blk_seek);
+
+    iter->Seek("8");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(2, num_find_file_in_level);
+    // Still re-seek because "8" is the boundary key, which has
+    // the same user key as the seek key.
+    ASSERT_EQ(4, num_idx_blk_seek);
+
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(5, num_idx_blk_seek);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(5, num_idx_blk_seek);
+
+    // Seeking backward never allows the index block seek to be skipped
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(6, num_idx_blk_seek);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
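+
+// Editor's note (hedged): the test above relies on a LevelIterator
+// optimization where a forward Seek() whose target is known to stay within
+// the current file and block skips re-running the file binary search and the
+// index-block seek; the two sync-point counters approximate how often each
+// path executes.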
+
+// MyRocks may change iterate bounds before seek. Simply test to make sure
+// such usage doesn't break the iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+  Options options = CurrentOptions();
+  options.compression = CompressionType::kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 100;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  std::string value(50, 'v');
+  Reopen(options);
+  ASSERT_OK(Put("aaa", value));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("bbb", "v"));
+  ASSERT_OK(Put("ccc", "v"));
+  ASSERT_OK(Put("ddd", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("eee", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  std::string ub1 = "e";
+  std::string ub2 = "c";
+  Slice ub(ub1);
+  ReadOptions read_opts1;
+  read_opts1.iterate_upper_bound = &ub;
+  Iterator* iter = NewIterator(read_opts1);
+  // Seek and iterate across the block boundary.
+  iter->Seek("b");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  ub = Slice(ub2);
+  iter->Seek("b");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  delete iter;
+
+  std::string lb1 = "a";
+  std::string lb2 = "c";
+  Slice lb(lb1);
+  ReadOptions read_opts2;
+  read_opts2.iterate_lower_bound = &lb;
+  iter = NewIterator(read_opts2);
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  lb = Slice(lb2);
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  iter->Prev();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
+  ASSERT_OK(Put("aaa", "v"));
+  ASSERT_OK(Put("bbb", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ccc", "v"));
+  ASSERT_OK(Put("ddd", "v"));
+  ASSERT_OK(Flush());
+  // Move both files to the bottom level.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Slice lower_bound("b");
+  ReadOptions read_opts;
+  read_opts.iterate_lower_bound = &lower_bound;
+  std::unique_ptr<Iterator> iter(NewIterator(read_opts));
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  iter->Prev();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
+                        testing::Values(true, false));
+
+// Tests how DBIter works with ReadCallback
+class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
+
+TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
+  class TestReadCallback : public ReadCallback {
+   public:
+    explicit TestReadCallback(SequenceNumber _max_visible_seq)
+        : ReadCallback(_max_visible_seq) {}
+
+    bool IsVisibleFullCheck(SequenceNumber seq) override {
+      return seq <= max_visible_seq_;
+    }
+  };
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("foo", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  ASSERT_OK(Put("a", "va"));
+  ASSERT_OK(Put("z", "vz"));
+  SequenceNumber seq1 = db_->GetLatestSequenceNumber();
+  TestReadCallback callback1(seq1);
+  ASSERT_OK(Put("foo", "v4"));
+  ASSERT_OK(Put("foo", "v5"));
+  ASSERT_OK(Put("bar", "v7"));
+
+  SequenceNumber seq2 = db_->GetLatestSequenceNumber();
+  auto* cfd =
+      reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+          ->cfd();
+  // The iterator is supposed to see data before seq1.
+  Iterator* iter =
+      dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
+
+  // Seek
+  // The latest value of "foo" before seq1 is "v3"
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key
+  // "foo".
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+
+  // Next
+  // Seek to "a"
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("va", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key
+  // "foo".
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+
+  // Prev
+  // Seek to "z"
+  iter->Seek("z");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("vz", iter->value());
+  // The previous key is "foo", which is visible to the iterator.
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key "a".
+  iter->Prev();  // skipping "bar"
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("a", iter->key());
+  ASSERT_EQ("va", iter->value());
+
+  // SeekForPrev
+  // The previous key is "foo", which is visible to the iterator.
+  iter->SeekForPrev("y");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key "a".
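+  // Editor's note (hedged): SeekForPrev positions on the largest *visible*
+  // key <= target, so with every "bar" version newer than seq1 filtered out
+  // by the callback, the iterator is expected to settle on "a".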
+ iter->SeekForPrev("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("a", iter->key()); + ASSERT_EQ("va", iter->value()); + + delete iter; + + // Prev beyond max_sequential_skip_in_iterations + uint64_t num_versions = + CurrentOptions().max_sequential_skip_in_iterations + 10; + for (uint64_t i = 0; i < num_versions; i++) { + ASSERT_OK(Put("bar", ToString(i))); + } + SequenceNumber seq3 = db_->GetLatestSequenceNumber(); + TestReadCallback callback2(seq3); + ASSERT_OK(Put("bar", "v8")); + SequenceNumber seq4 = db_->GetLatestSequenceNumber(); + + // The iterator is suppose to see data before seq3. + iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2); + // Seek to "z", which is visible. + iter->Seek("z"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("vz", iter->value()); + // Previous key is "foo" and the last value "v5" is visible. + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v5", iter->value()); + // Since the number of values of "bar" is more than + // max_sequential_skip_in_iterations, Prev() will ultimately fallback to + // seek in forward direction. Here we test the fallback seek is correct. + // The last visible value should be (num_versions - 1), as "v8" is not + // visible. + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bar", iter->key()); + ASSERT_EQ(ToString(num_versions - 1), iter->value()); + + delete iter; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc new file mode 100644 index 000000000..1f9ff0d45 --- /dev/null +++ b/src/rocksdb/db/db_log_iter_test.cc @@ -0,0 +1,294 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Introduction of SyncPoint effectively disabled building and running this test +// in Release build. 
+// which is a pity, as it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+  DBTestXactLogIterator() : DBTestBase("/db_log_iter_test") {}
+
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    std::unique_ptr<TransactionLogIterator> iter;
+    Status status = dbfull()->GetUpdatesSince(seq, &iter);
+    EXPECT_OK(status);
+    EXPECT_TRUE(iter->Valid());
+    return iter;
+  }
+};
+
+namespace {
+SequenceNumber ReadRecords(std::unique_ptr<TransactionLogIterator>& iter,
+                           int& count) {
+  count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    EXPECT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    EXPECT_OK(iter->status());
+    iter->Next();
+  }
+  return res.sequence;
+}
+
+void ExpectRecords(const int expected_no_records,
+                   std::unique_ptr<TransactionLogIterator>& iter) {
+  int num_records;
+  ReadRecords(iter, num_records);
+  ASSERT_EQ(num_records, expected_no_records);
+}
+}  // namespace
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    Put(0, "key1", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(3, iter);
+    }
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    {
+      Put(0, "key4", DummyString(1024));
+      Put(1, "key5", DummyString(1024));
+      Put(0, "key6", DummyString(1024));
+    }
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(6, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG  // sync points are not compiled in with -DNDEBUG builds
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+  static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+  static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+      {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+      {"WalManager::GetSortedWalsOfType:1",
+       "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2",
+       "WalManager::GetSortedWalsOfType:2"}};
+  for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up a sync point dependency to reproduce the race condition of
+    // a log file being moved to the archive dir in the middle of
+    // GetSortedWalFiles
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {sync_points[test][0], sync_points[test][1]},
+        {sync_points[test][2], sync_points[test][3]},
+    });
+
+    do {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      Options options = OptionsForLogIterTest();
+      DestroyAndReopen(options);
+      Put("key1", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key2", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key3", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key4", DummyString(1024));
+      ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+      dbfull()->FlushWAL(false);
+
+      {
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(4, iter);
+      }
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
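+      // Editor's note (hedged): the dependency pairs loaded above make
+      // PurgeObsoleteFiles run in the middle of GetSortedWalFiles, which is
+      // the window in which a WAL file can move to the archive directory.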
+      // Trigger the async flush and the log move. The log move will wait on
+      // GetSortedWalFiles:1 so the race condition is reproduced.
+      FlushOptions flush_options;
+      flush_options.wait = false;
+      dbfull()->Flush(flush_options);
+
+      // "key5" would be written in a new memtable and log
+      Put("key5", DummyString(1024));
+      dbfull()->FlushWAL(false);
+      {
+        // this iter would miss "key4" if not fixed
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(5, iter);
+      }
+    } while (ChangeCompactOptions());
+  }
+}
+#endif
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    Put("key1", DummyString(1024));
+    auto iter = OpenTransactionLogIter(0);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    Put("key2", DummyString(1024));
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    Put("key1", DummyString(1024));
+    Put("key2", DummyString(1023));
+    dbfull()->Flush(FlushOptions());
+    Reopen(options);
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    for (int i = 0; i < 1024; i++) {
+      Put("key" + ToString(i), DummyString(10));
+    }
+    dbfull()->Flush(FlushOptions());
+    dbfull()->FlushWAL(false);
+    // Corrupt this log to create a gap
+    ROCKSDB_NAMESPACE::VectorLogPtr wal_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+    if (mem_env_) {
+      mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2);
+    } else {
+      ASSERT_EQ(0, truncate(logfile_path.c_str(),
+                            wal_files.front()->SizeFileBytes() / 2));
+    }
+
+    // Insert a new entry to a new log file
+    Put("key1025", DummyString(10));
+    dbfull()->FlushWAL(false);
+    // Try to read from the beginning. Should stop before the gap and read
+    // fewer than 1025 entries.
+    auto iter = OpenTransactionLogIter(0);
+    int count;
+    SequenceNumber last_sequence_read = ReadRecords(iter, count);
+    ASSERT_LT(last_sequence_read, 1025U);
+    // Try to read past the gap; should be able to seek to key1025.
+    auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+    ExpectRecords(1, iter2);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    WriteBatch batch;
+    batch.Put(handles_[1], "key1", DummyString(1024));
+    batch.Put(handles_[0], "key2", DummyString(1024));
+    batch.Put(handles_[1], "key3", DummyString(1024));
+    batch.Delete(handles_[0], "key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    Flush(1);
+    Flush(0);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    Put(1, "key4", DummyString(1024));
+    auto iter = OpenTransactionLogIter(3);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  {
+    WriteBatch batch;
+    batch.Put(handles_[1], "key1", DummyString(1024));
+    batch.Put(handles_[0], "key2", DummyString(1024));
+    batch.PutLogData(Slice("blob1"));
+    batch.Put(handles_[1], "key3", DummyString(1024));
+    batch.PutLogData(Slice("blob2"));
+    batch.Delete(handles_[0], "key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  }
+
+  auto res = OpenTransactionLogIter(0)->GetBatch();
+  struct Handler : public WriteBatch::Handler {
+    std::string seen;
+    Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+      seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " +
+              ToString(value.size()) + ")";
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t cf, const Slice& key,
+                   const Slice& value) override {
+      seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " +
+              ToString(value.size()) + ")";
+      return Status::OK();
+    }
+    void LogData(const Slice& blob) override {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")";
+      return Status::OK();
+    }
+  } handler;
+  res.writeBatchPtr->Iterate(&handler);
+  ASSERT_EQ(
+      "Put(1, key1, 1024)"
+      "Put(0, key2, 1024)"
+      "LogData(blob1)"
+      "Put(1, key3, 1024)"
+      "LogData(blob2)"
+      "Delete(0, key2)",
+      handler.seen);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_memtable_test.cc b/src/rocksdb/db/db_memtable_test.cc
new file mode 100644
index 000000000..a2f4e327c
--- /dev/null
+++ b/src/rocksdb/db/db_memtable_test.cc
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMemTableTest : public DBTestBase {
+ public:
+  DBMemTableTest() : DBTestBase("/db_memtable_test") {}
+};
+
+class MockMemTableRep : public MemTableRep {
+ public:
+  explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
+      : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
+
+  KeyHandle Allocate(const size_t len, char** buf) override {
+    return rep_->Allocate(len, buf);
+  }
+
+  void Insert(KeyHandle handle) override { rep_->Insert(handle); }
+
+  void InsertWithHint(KeyHandle handle, void** hint) override {
+    num_insert_with_hint_++;
+    EXPECT_NE(nullptr, hint);
+    last_hint_in_ = *hint;
+    rep_->InsertWithHint(handle, hint);
+    last_hint_out_ = *hint;
+  }
+
+  bool Contains(const char* key) const override { return rep_->Contains(key); }
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override {
+    rep_->Get(k, callback_args, callback_func);
+  }
+
+  size_t ApproximateMemoryUsage() override {
+    return rep_->ApproximateMemoryUsage();
+  }
+
+  Iterator* GetIterator(Arena* arena) override {
+    return rep_->GetIterator(arena);
+  }
+
+  void* last_hint_in() { return last_hint_in_; }
+  void* last_hint_out() { return last_hint_out_; }
+  int num_insert_with_hint() { return num_insert_with_hint_; }
+
+ private:
+  std::unique_ptr<MemTableRep> rep_;
+  void* last_hint_in_;
+  void* last_hint_out_;
+  int num_insert_with_hint_;
+};
+
+class MockMemTableRepFactory : public MemTableRepFactory {
+ public:
+  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 Logger* logger) override {
+    SkipListFactory factory;
+    MemTableRep* skiplist_rep =
+        factory.CreateMemTableRep(cmp, allocator, transform, logger);
+    mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
+    return mock_rep_;
+  }
+
+  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 Logger* logger,
+                                 uint32_t column_family_id) override {
+    last_column_family_id_ = column_family_id;
+    return CreateMemTableRep(cmp, allocator, transform, logger);
+  }
+
+  const char* Name() const override { return "MockMemTableRepFactory"; }
+
+  MockMemTableRep* rep() { return mock_rep_; }
+
+  bool IsInsertConcurrentlySupported() const override { return false; }
+
+  uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
+
+ private:
+  MockMemTableRep* mock_rep_;
+  // Workaround, since there's no port::kMaxUint32 yet.
+  uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
+};
+
+class TestPrefixExtractor : public SliceTransform {
+ public:
+  const char* Name() const override { return "TestPrefixExtractor"; }
+
+  Slice Transform(const Slice& key) const override {
+    const char* p = separator(key);
+    if (p == nullptr) {
+      return Slice();
+    }
+    return Slice(key.data(), p - key.data() + 1);
+  }
+
+  bool InDomain(const Slice& key) const override {
+    return separator(key) != nullptr;
+  }
+
+  bool InRange(const Slice& /*key*/) const override { return false; }
+
+ private:
+  const char* separator(const Slice& key) const {
+    return reinterpret_cast<const char*>(
+        memchr(key.data(), '_', key.size()));
+  }
+};
+
+// Test that ::Add properly returns false when inserting duplicate keys
+TEST_F(DBMemTableTest, DuplicateSeq) {
+  SequenceNumber seq = 123;
+  std::string value;
+  Status s;
+  MergeContext merge_context;
+  Options options;
+  InternalKeyComparator ikey_cmp(options.comparator);
+  ReadRangeDelAggregator range_del_agg(&ikey_cmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableCFOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Write some keys and make sure it returns false on duplicates
+  bool res;
+  res = mem->Add(seq, kTypeValue, "key", "value2");
+  ASSERT_TRUE(res);
+  res = mem->Add(seq, kTypeValue, "key", "value2");
+  ASSERT_FALSE(res);
+  // Changing the type should still be detected as a duplicate key
+  res = mem->Add(seq, kTypeMerge, "key", "value2");
+  ASSERT_FALSE(res);
+  // Changing the seq number will make the key fresh
+  res = mem->Add(seq + 1, kTypeMerge, "key", "value2");
+  ASSERT_TRUE(res);
+  // Test with different types for duplicate keys
+  res = mem->Add(seq, kTypeDeletion, "key", "");
+  ASSERT_FALSE(res);
+  res = mem->Add(seq, kTypeSingleDeletion, "key", "");
+  ASSERT_FALSE(res);
+
+  // Test the duplicate keys under stress
+  for (int i = 0; i < 10000; i++) {
+    bool insert_dup = i % 10 == 1;
+    if (!insert_dup) {
+      seq++;
+    }
+    res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq));
+    if (insert_dup) {
+      ASSERT_FALSE(res);
+    } else {
+      ASSERT_TRUE(res);
+    }
+  }
+  delete mem;
+
+  // Test with InsertWithHint
+  options.memtable_insert_with_hint_prefix_extractor.reset(
+      new TestPrefixExtractor());  // which uses _ to extract the prefix
+  ioptions = ImmutableCFOptions(options);
+  mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                     kMaxSequenceNumber, 0 /* column_family_id */);
+  // Insert a duplicate key with _ in it
+  res = mem->Add(seq, kTypeValue, "key_1", "value");
+  ASSERT_TRUE(res);
+  res = mem->Add(seq, kTypeValue, "key_1", "value");
+  ASSERT_FALSE(res);
+  delete mem;
+
+  // Test when InsertConcurrently will be invoked
+  options.allow_concurrent_memtable_write = true;
+  ioptions = ImmutableCFOptions(options);
+  mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                     kMaxSequenceNumber, 0 /* column_family_id */);
+  MemTablePostProcessInfo post_process_info;
+  res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info);
+  ASSERT_TRUE(res);
+  res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info);
+  ASSERT_FALSE(res);
+  delete mem;
+}
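+
+// Editor's note (hedged): MemTable::Add returning false is how the memtable
+// reports a duplicate (seq, key) insertion; a sketch of the calling pattern
+// exercised above, using the same internal API:
+//
+//   if (!mem->Add(seq, kTypeValue, key, value)) {
+//     // the same (seq, key) was inserted before; the caller must retry
+//     // with a fresh sequence number
+//   }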
+
+// A simple test to verify that concurrent merge writes are functional
+TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
+  int num_ops = 1000;
+  std::string value;
+  Status s;
+  MergeContext merge_context;
+  Options options;
+  // A merge operator that is not sensitive to concurrent writes since in this
+  // test we don't order the writes.
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  options.allow_concurrent_memtable_write = true;
+  ImmutableCFOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Put 0 as the base
+  PutFixed64(&value, static_cast<uint64_t>(0));
+  bool res = mem->Add(0, kTypeValue, "key", value);
+  ASSERT_TRUE(res);
+  value.clear();
+
+  // Write Merge concurrently
+  ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
+    MemTablePostProcessInfo post_process_info1;
+    std::string v1;
+    for (int seq = 1; seq < num_ops / 2; seq++) {
+      PutFixed64(&v1, seq);
+      bool res1 =
+          mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1);
+      ASSERT_TRUE(res1);
+      v1.clear();
+    }
+  });
+  ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
+    MemTablePostProcessInfo post_process_info2;
+    std::string v2;
+    for (int seq = num_ops / 2; seq < num_ops; seq++) {
+      PutFixed64(&v2, seq);
+      bool res2 =
+          mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2);
+      ASSERT_TRUE(res2);
+      v2.clear();
+    }
+  });
+  write_thread1.join();
+  write_thread2.join();
+
+  Status status;
+  ReadOptions roptions;
+  SequenceNumber max_covering_tombstone_seq = 0;
+  LookupKey lkey("key", kMaxSequenceNumber);
+  res = mem->Get(lkey, &value, &status, &merge_context,
+                 &max_covering_tombstone_seq, roptions);
+  ASSERT_TRUE(res);
+  uint64_t ivalue = DecodeFixed64(Slice(value).data());
+  uint64_t sum = 0;
+  for (int seq = 0; seq < num_ops; seq++) {
+    sum += seq;
+  }
+  ASSERT_EQ(ivalue, sum);
+
+  delete mem;
+}
+
+TEST_F(DBMemTableTest, InsertWithHint) {
+  Options options;
+  options.allow_concurrent_memtable_write = false;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(new MockMemTableRepFactory());
+  options.memtable_insert_with_hint_prefix_extractor.reset(
+      new TestPrefixExtractor());
+  options.env = env_;
+  Reopen(options);
+  MockMemTableRep* rep =
+      reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+          ->rep();
+  ASSERT_OK(Put("foo_k1", "foo_v1"));
+  ASSERT_EQ(nullptr, rep->last_hint_in());
+  void* hint_foo = rep->last_hint_out();
+  ASSERT_OK(Put("foo_k2", "foo_v2"));
+  ASSERT_EQ(hint_foo, rep->last_hint_in());
+  ASSERT_EQ(hint_foo, rep->last_hint_out());
+  ASSERT_OK(Put("foo_k3", "foo_v3"));
+  ASSERT_EQ(hint_foo, rep->last_hint_in());
+  ASSERT_EQ(hint_foo, rep->last_hint_out());
+  ASSERT_OK(Put("bar_k1", "bar_v1"));
+  ASSERT_EQ(nullptr, rep->last_hint_in());
+  void* hint_bar = rep->last_hint_out();
+  ASSERT_NE(hint_foo, hint_bar);
+  ASSERT_OK(Put("bar_k2", "bar_v2"));
+  ASSERT_EQ(hint_bar, rep->last_hint_in());
+  ASSERT_EQ(hint_bar, rep->last_hint_out());
+  ASSERT_EQ(5, rep->num_insert_with_hint());
+  ASSERT_OK(Put("whitelisted", "vvv"));
+  ASSERT_EQ(5, rep->num_insert_with_hint());
+  ASSERT_EQ("foo_v1", Get("foo_k1"));
+  ASSERT_EQ("foo_v2", Get("foo_k2"));
+  ASSERT_EQ("foo_v3", Get("foo_k3"));
+  ASSERT_EQ("bar_v1", Get("bar_k1"));
+  ASSERT_EQ("bar_v2", Get("bar_k2"));
+  ASSERT_EQ("vvv", Get("whitelisted"));
+}
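+
+// Editor's note (hedged): InsertWithHint threads a per-prefix hint (an opaque
+// void*) through consecutive inserts that share a prefix, letting the
+// skiplist skip part of its search; the assertions above check that the hint
+// round-trips unchanged within each prefix.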
+
+TEST_F(DBMemTableTest, ColumnFamilyId) {
+  // Verifies MemTableRepFactory is told the right column family id.
+  Options options;
+  options.allow_concurrent_memtable_write = false;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(new MockMemTableRepFactory());
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (uint32_t cf = 0; cf < 2; ++cf) {
+    ASSERT_OK(Put(cf, "key", "val"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_EQ(cf, static_cast<MockMemTableRepFactory*>(
+                      options.memtable_factory.get())
+                      ->GetLastColumnFamilyId());
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operand_test.cc b/src/rocksdb/db/db_merge_operand_test.cc
new file mode 100644
index 000000000..a0ab34e01
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operand_test.cc
@@ -0,0 +1,240 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMergeOperandTest : public DBTestBase {
+ public:
+  DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {}
+};
+
+TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
+  class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+   public:
+    LimitedStringAppendMergeOp(int limit, char delim)
+        : StringAppendTESTOperator(delim), limit_(limit) {}
+
+    const char* Name() const override {
+      return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+    }
+
+    bool ShouldMerge(const std::vector<Slice>& operands) const override {
+      if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+        return true;
+      }
+      return false;
+    }
+
+   private:
+    size_t limit_ = 0;
+  };
+
+  Options options;
+  options.create_if_missing = true;
+  // Use only the latest two merge operands.
+  options.merge_operator =
+      std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+  options.env = env_;
+  Reopen(options);
+  int num_records = 4;
+  int number_of_operands = 0;
+  std::vector<PinnableSlice> values(num_records);
+  GetMergeOperandsOptions merge_operands_info;
+  merge_operands_info.expected_max_number_of_operands = num_records;
+
+  // k0 value in memtable
+  Put("k0", "PutARock");
+  db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0",
+                        values.data(), &merge_operands_info,
+                        &number_of_operands);
+  ASSERT_EQ(values[0], "PutARock");
+
+  // k0.1 value in SST
+  Put("k0.1", "RockInSST");
+  ASSERT_OK(Flush());
+  db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1",
+                        values.data(), &merge_operands_info,
+                        &number_of_operands);
+  ASSERT_EQ(values[0], "RockInSST");
+
+  // All k1 values are in memtable.
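+  // Editor's note (hedged): GetMergeOperands returns the operand stack
+  // without applying the merge operator, and a Put acts as a base value that
+  // supersedes anything older, so "x" below is expected as operand 0.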
+ ASSERT_OK(Merge("k1", "a")); + Put("k1", "x"); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1.1 values are in memtable. + ASSERT_OK(Merge("k1.1", "r")); + Delete("k1.1"); + ASSERT_OK(Merge("k1.1", "c")); + ASSERT_OK(Merge("k1.1", "k")); + ASSERT_OK(Merge("k1.1", "s")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "c"); + ASSERT_EQ(values[1], "k"); + ASSERT_EQ(values[2], "s"); + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "q"); + ASSERT_EQ(values[1], "w"); + ASSERT_EQ(values[2], "e"); + ASSERT_EQ(values[3], "r"); + + // All k2.1 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.1", "m")); + Put("k2.1", "l"); + ASSERT_OK(Merge("k2.1", "n")); + ASSERT_OK(Merge("k2.1", "o")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "l,n,o"); + + // All k2.2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.2", "g")); + Delete("k2.2"); + ASSERT_OK(Merge("k2.2", "o")); + ASSERT_OK(Merge("k2.2", "t")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "o,t"); + + // Do some compaction that will make the following tests more predictable + // Slice start("PutARock"); + // Slice end("t"); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // All k3 values are flushed and are in different files. + ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All k3.1 values are flushed and are in different files. 
+ ASSERT_OK(Merge("k3.1", "ab")); + ASSERT_OK(Flush()); + Put("k3.1", "bc"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "bc"); + ASSERT_EQ(values[1], "cd"); + ASSERT_EQ(values[2], "de"); + + // All k3.2 values are flushed and are in different files. + ASSERT_OK(Merge("k3.2", "ab")); + ASSERT_OK(Flush()); + Delete("k3.2"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "cd"); + ASSERT_EQ(values[1], "de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); + + // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + ASSERT_OK(Merge("k5", "who")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Flush()); + Put("k5", "remember"); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "rocks")); + dbfull()->TEST_SwitchMemtable(); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "remember"); + ASSERT_EQ(values[1], "i"); + ASSERT_EQ(values[2], "am"); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_merge_operator_test.cc b/src/rocksdb/db/db_merge_operator_test.cc new file mode 100644 index 000000000..4f762468d --- /dev/null +++ b/src/rocksdb/db/db_merge_operator_test.cc @@ -0,0 +1,666 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include +#include + +#include "db/db_test_util.h" +#include "db/forward_iterator.h" +#include "port/stack_trace.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace ROCKSDB_NAMESPACE { + +class TestReadCallback : public ReadCallback { + public: + TestReadCallback(SnapshotChecker* snapshot_checker, + SequenceNumber snapshot_seq) + : ReadCallback(snapshot_seq), + snapshot_checker_(snapshot_checker), + snapshot_seq_(snapshot_seq) {} + + bool IsVisibleFullCheck(SequenceNumber seq) override { + return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) == + SnapshotCheckerResult::kInSnapshot; + } + + private: + SnapshotChecker* snapshot_checker_; + SequenceNumber snapshot_seq_; +}; + +// Test merge operator functionality. 
+class DBMergeOperatorTest : public DBTestBase { + public: + DBMergeOperatorTest() : DBTestBase("/db_merge_operator_test") {} + + std::string GetWithReadCallback(SnapshotChecker* snapshot_checker, + const Slice& key, + const Snapshot* snapshot = nullptr) { + SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber() + : snapshot->GetSequenceNumber(); + TestReadCallback read_callback(snapshot_checker, seq); + ReadOptions read_opt; + read_opt.snapshot = snapshot; + PinnableSlice value; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = db_->DefaultColumnFamily(); + get_impl_options.value = &value; + get_impl_options.callback = &read_callback; + Status s = dbfull()->GetImpl(read_opt, key, get_impl_options); + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } +}; + +TEST_F(DBMergeOperatorTest, LimitMergeOperands) { + class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; + }; + + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. + options.merge_operator = + std::make_shared(2, ','); + options.env = env_; + Reopen(options); + // All K1 values are in memtable. + ASSERT_OK(Merge("k1", "a")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok()); + // Make sure that only the latest two merge operands are used. If this was + // not the case the value would be "a,b,c,d". + ASSERT_EQ(value, "c,d"); + + // All K2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2", "a")); + ASSERT_OK(Merge("k2", "b")); + ASSERT_OK(Merge("k2", "c")); + ASSERT_OK(Merge("k2", "d")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok()); + ASSERT_EQ(value, "c,d"); + + // All K3 values are flushed and are in different files. 
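+  // (The read path collects operands newest-to-oldest and consults
+  // ShouldMerge along the way; once it returns true, in this test after the
+  // two newest operands, older operands are not fetched even from other
+  // files.)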
+ ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok()); + ASSERT_EQ(value, "cd,de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ab")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "bc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "cd")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "de")); + ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok()); + ASSERT_EQ(value, "cd,de"); +} + +TEST_F(DBMergeOperatorTest, MergeErrorOnRead) { + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new TestPutOperator()); + options.env = env_; + Reopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Merge("k1", "corrupted")); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption()); + VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}}); +} + +TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) { + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new TestPutOperator()); + options.max_successive_merges = 3; + options.env = env_; + Reopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Merge("k1", "v2")); + // Will trigger a merge when hitting max_successive_merges and the merge + // will fail. The delta will be inserted nevertheless. + ASSERT_OK(Merge("k1", "corrupted")); + // Data should stay unmerged after the error. + VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v2"}, {"k1", "v1"}}); +} + +TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) { + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new TestPutOperator()); + options.env = env_; + + DestroyAndReopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Merge("k1", "corrupted")); + ASSERT_OK(Put("k2", "v2")); + auto* iter = db_->NewIterator(ReadOptions()); + iter->Seek("k1"); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; + VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}}); + + DestroyAndReopen(options); + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Merge("k2", "corrupted")); + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; + VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}}); +} + + +class MergeOperatorPinningTest : public DBMergeOperatorTest, + public testing::WithParamInterface { + public: + MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest, + ::testing::Bool()); + +#ifndef ROCKSDB_LITE +TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; // every block will contain one entry + table_options.no_block_cache = disable_block_cache_; + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + const int kKeysPerFile = 10; + const int kOperandsPerKeyPerFile = 7; + const int kOperandSize = 100; + // Filse to write in L0 before compacting to lower level + const int kFilesPerLevel = 3; + + Random rnd(301); + std::map true_data; + int batch_num = 1; + int lvl_to_fill = 4; + int key_id = 0; + while (true) { + for (int j = 0; j < kKeysPerFile; j++) { + std::string key = Key(key_id % 35); + key_id++; + for (int k = 0; k < kOperandsPerKeyPerFile; k++) { + std::string val = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, val)); + if (true_data[key].size() == 0) { + true_data[key] = val; + } else { + true_data[key] += "," + val; + } + } + } + + if (lvl_to_fill == -1) { + // Keep last batch in memtable and stop + break; + } + + ASSERT_OK(Flush()); + if (batch_num % kFilesPerLevel == 0) { + if (lvl_to_fill != 0) { + MoveFilesToLevel(lvl_to_fill); + } + lvl_to_fill--; + } + batch_num++; + } + + // 3 L0 files + // 1 L1 file + // 3 L2 files + // 1 L3 file + // 3 L4 Files + ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3"); + + VerifyDBFromMap(true_data); +} + +class MergeOperatorHook : public MergeOperator { + public: + explicit MergeOperatorHook(std::shared_ptr _merge_op) + : merge_op_(_merge_op) {} + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + before_merge_(); + bool res = merge_op_->FullMergeV2(merge_in, merge_out); + after_merge_(); + return res; + } + + const char* Name() const override { return merge_op_->Name(); } + + std::shared_ptr merge_op_; + std::function before_merge_ = []() {}; + std::function after_merge_ = []() {}; +}; + +TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) { + Options options = CurrentOptions(); + + auto merge_hook = + std::make_shared(MergeOperators::CreateMaxOperator()); + options.merge_operator = merge_hook; + options.disable_auto_compactions = true; + options.level0_slowdown_writes_trigger = (1 << 30); + options.level0_stop_writes_trigger = (1 << 30); + options.max_open_files = 20; + BlockBasedTableOptions bbto; + bbto.no_block_cache = disable_block_cache_; + if (bbto.no_block_cache == false) { + bbto.block_cache = NewLRUCache(64 * 1024 * 1024); + } else { + bbto.block_cache = nullptr; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const int kNumOperands = 30; + const int kNumKeys = 1000; + const int kOperandSize = 100; + Random rnd(301); + + // 1000 keys every key have 30 operands, every operand is in a different file + std::map true_data; + for (int i = 0; i < kNumOperands; i++) { + for (int j = 0; j < kNumKeys; j++) { + std::string k = Key(j); + std::string v = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), k, v)); + + true_data[k] = std::max(true_data[k], v); + } + ASSERT_OK(Flush()); + } + + std::vector file_numbers = ListTableFiles(env_, dbname_); + ASSERT_EQ(file_numbers.size(), kNumOperands); + int merge_cnt = 0; + + // Code executed before merge operation + merge_hook->before_merge_ = [&]() { + // Evict all tables from cache before every merge operation + for (uint64_t num : file_numbers) { + TableCache::Evict(dbfull()->TEST_table_cache(), num); + } + // 
Decrease cache capacity to force all unrefed blocks to be evicted + if (bbto.block_cache) { + bbto.block_cache->SetCapacity(1); + } + merge_cnt++; + }; + + // Code executed after merge operation + merge_hook->after_merge_ = [&]() { + // Increase capacity again after doing the merge + if (bbto.block_cache) { + bbto.block_cache->SetCapacity(64 * 1024 * 1024); + } + }; + + size_t total_reads; + VerifyDBFromMap(true_data, &total_reads); + ASSERT_EQ(merge_cnt, total_reads); + + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + VerifyDBFromMap(true_data, &total_reads); +} + +TEST_P(MergeOperatorPinningTest, TailingIterator) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const int kNumOperands = 100; + const int kNumWrites = 100000; + + std::function writer_func = [&]() { + int k = 0; + for (int i = 0; i < kNumWrites; i++) { + db_->Merge(WriteOptions(), Key(k), Key(k)); + + if (i && i % kNumOperands == 0) { + k++; + } + if (i && i % 127 == 0) { + ASSERT_OK(Flush()); + } + if (i && i % 317 == 0) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + }; + + std::function reader_func = [&]() { + ReadOptions ro; + ro.tailing = true; + Iterator* iter = db_->NewIterator(ro); + + iter->SeekToFirst(); + for (int i = 0; i < (kNumWrites / kNumOperands); i++) { + while (!iter->Valid()) { + // wait for the key to be written + env_->SleepForMicroseconds(100); + iter->Seek(Key(i)); + } + ASSERT_EQ(iter->key(), Key(i)); + ASSERT_EQ(iter->value(), Key(i)); + + iter->Next(); + } + + delete iter; + }; + + ROCKSDB_NAMESPACE::port::Thread writer_thread(writer_func); + ROCKSDB_NAMESPACE::port::Thread reader_thread(reader_func); + + writer_thread.join(); + reader_thread.join(); +} + +TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + // Overview of the test: + // * There are two merge operands for the same key: one in an sst file, + // another in a memtable. + // * Seek a tailing iterator to this key. + // * As part of the seek, the iterator will: + // (a) first visit the operand in the memtable and tell ForwardIterator + // to pin this operand, then + // (b) move on to the operand in the sst file, then pass both operands + // to merge operator. + // * The memtable may get flushed and unreferenced by another thread between + // (a) and (b). The test simulates it by flushing the memtable inside a + // SyncPoint callback located between (a) and (b). + // * In this case it's ForwardIterator's responsibility to keep the memtable + // pinned until (b) is complete. There used to be a bug causing + // ForwardIterator to not pin it in some circumstances. This test + // reproduces it. 
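+  // The two SyncPoint callbacks installed below implement steps (a) and (b):
+  // the first flushes the memtable right after its operand is pinned, and
+  // the second drops the last external reference to SuperVersion A.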
+
+  db_->Merge(WriteOptions(), "key", "sst");
+  db_->Flush(FlushOptions());  // Switch to SuperVersion A
+  db_->Merge(WriteOptions(), "key", "memtable");
+
+  // Pin SuperVersion A
+  std::unique_ptr<Iterator> someone_else(db_->NewIterator(ReadOptions()));
+
+  bool pushed_first_operand = false;
+  bool stepped_to_next_operand = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) {
+        EXPECT_FALSE(pushed_first_operand);
+        pushed_first_operand = true;
+        db_->Flush(FlushOptions());  // Switch to SuperVersion B
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) {
+        EXPECT_FALSE(stepped_to_next_operand);
+        stepped_to_next_operand = true;
+        someone_else.reset();  // Unpin SuperVersion A
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadOptions ro;
+  ro.tailing = true;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+  iter->Seek("key");
+
+  ASSERT_TRUE(iter->status().ok());
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString());
+  EXPECT_TRUE(pushed_first_operand);
+  EXPECT_TRUE(stepped_to_next_operand);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  class TestSnapshotChecker : public SnapshotChecker {
+   public:
+    SnapshotCheckerResult CheckInSnapshot(
+        SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+      return IsInSnapshot(seq, snapshot_seq)
+                 ? SnapshotCheckerResult::kInSnapshot
+                 : SnapshotCheckerResult::kNotInSnapshot;
+    }
+
+    bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const {
+      switch (snapshot_seq) {
+        case 0:
+          return seq == 0;
+        case 1:
+          return seq <= 1;
+        case 2:
+          // seq = 2 not visible to snapshot with seq = 2
+          return seq <= 1;
+        case 3:
+          return seq <= 3;
+        case 4:
+          // seq = 4 not visible to snapshot with seq = 4
+          return seq <= 3;
+        default:
+          // seq >= 5 is uncommitted
+          return seq <= 4;
+      };
+    }
+  };
+  TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker();
+  dbfull()->SetSnapshotChecker(snapshot_checker);
+
+  std::string value;
+  ASSERT_OK(Merge("foo", "v1"));
+  ASSERT_EQ(1, db_->GetLatestSequenceNumber());
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+  ASSERT_OK(Merge("foo", "v2"));
+  ASSERT_EQ(2, db_->GetLatestSequenceNumber());
+  // v2 is not visible to latest snapshot, which has seq = 2.
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+  // Take a snapshot with seq = 2.
+  const Snapshot* snapshot1 = db_->GetSnapshot();
+  ASSERT_EQ(2, snapshot1->GetSequenceNumber());
+  // v2 is not visible to snapshot1, which has seq = 2
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+
+  // Verify flush doesn't alter the result.
+  ASSERT_OK(Flush());
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+
+  ASSERT_OK(Merge("foo", "v3"));
+  ASSERT_EQ(3, db_->GetLatestSequenceNumber());
+  ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+  ASSERT_OK(Merge("foo", "v4"));
+  ASSERT_EQ(4, db_->GetLatestSequenceNumber());
+  // v4 is not visible to latest snapshot, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + const Snapshot* snapshot2 = db_->GetSnapshot(); + ASSERT_EQ(4, snapshot2->GetSequenceNumber()); + // v4 is not visible to snapshot2, which has seq = 4. + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + + // Verify flush doesn't alter the result. + ASSERT_OK(Flush()); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + + ASSERT_OK(Merge("foo", "v5")); + ASSERT_EQ(5, db_->GetLatestSequenceNumber()); + // v5 is uncommitted + ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo")); + + // full manual compaction. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Verify compaction doesn't alter the result. + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo")); + + db_->ReleaseSnapshot(snapshot1); + db_->ReleaseSnapshot(snapshot2); +} + +class PerConfigMergeOperatorPinningTest + : public DBMergeOperatorTest, + public testing::WithParamInterface> { + public: + PerConfigMergeOperatorPinningTest() { + std::tie(disable_block_cache_, option_config_) = GetParam(); + } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P( + MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest, + ::testing::Combine(::testing::Bool(), + ::testing::Range(static_cast(DBTestBase::kDefault), + static_cast(DBTestBase::kEnd)))); + +TEST_P(PerConfigMergeOperatorPinningTest, Randomized) { + if (ShouldSkipOptions(option_config_, kSkipMergePut)) { + return; + } + + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Random rnd(301); + std::map true_data; + + const int kTotalMerges = 5000; + // Every key gets ~10 operands + const int kKeyRange = kTotalMerges / 10; + const int kOperandSize = 20; + const int kNumPutBefore = kKeyRange / 10; // 10% value + const int kNumPutAfter = kKeyRange / 10; // 10% overwrite + const int kNumDelete = kKeyRange / 10; // 10% delete + + // kNumPutBefore keys will have base values + for (int i = 0; i < kNumPutBefore; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Do kTotalMerges merges + for (int i = 0; i < kTotalMerges; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + + if (true_data[key] < value) { + true_data[key] = value; + } + } + + // Overwrite random kNumPutAfter keys + for (int i = 0; i < kNumPutAfter; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Delete random kNumDelete keys + for (int i = 0; i < kNumDelete; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + ASSERT_OK(db_->Delete(WriteOptions(), key)); 
+
+    true_data.erase(key);
+  }
+
+  VerifyDBFromMap(true_data);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_options_test.cc b/src/rocksdb/db/db_options_test.cc
new file mode 100644
index 000000000..383f66cbf
--- /dev/null
+++ b/src/rocksdb/db/db_options_test.cc
@@ -0,0 +1,870 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/stats_history.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionsTest : public DBTestBase {
+ public:
+  DBOptionsTest() : DBTestBase("/db_options_test") {}
+
+#ifndef ROCKSDB_LITE
+  std::unordered_map<std::string, std::string> GetMutableDBOptionsMap(
+      const DBOptions& options) {
+    std::string options_str;
+    GetStringFromDBOptions(&options_str, options);
+    std::unordered_map<std::string, std::string> options_map;
+    StringToMap(options_str, &options_map);
+    std::unordered_map<std::string, std::string> mutable_map;
+    for (const auto& opt : db_options_type_info) {
+      if (opt.second.is_mutable &&
+          opt.second.verification != OptionVerificationType::kDeprecated) {
+        mutable_map[opt.first] = options_map[opt.first];
+      }
+    }
+    return mutable_map;
+  }
+
+  std::unordered_map<std::string, std::string> GetMutableCFOptionsMap(
+      const ColumnFamilyOptions& options) {
+    std::string options_str;
+    GetStringFromColumnFamilyOptions(&options_str, options);
+    std::unordered_map<std::string, std::string> options_map;
+    StringToMap(options_str, &options_map);
+    std::unordered_map<std::string, std::string> mutable_map;
+    for (const auto& opt : cf_options_type_info) {
+      if (opt.second.is_mutable &&
+          opt.second.verification != OptionVerificationType::kDeprecated) {
+        mutable_map[opt.first] = options_map[opt.first];
+      }
+    }
+    return mutable_map;
+  }
+
+  std::unordered_map<std::string, std::string> GetRandomizedMutableCFOptionsMap(
+      Random* rnd) {
+    Options options = CurrentOptions();
+    options.env = env_;
+    ImmutableDBOptions db_options(options);
+    test::RandomInitCFOptions(&options, options, rnd);
+    auto sanitized_options = SanitizeOptions(db_options, options);
+    auto opt_map = GetMutableCFOptionsMap(sanitized_options);
+    delete options.compaction_filter;
+    return opt_map;
+  }
+
+  std::unordered_map<std::string, std::string> GetRandomizedMutableDBOptionsMap(
+      Random* rnd) {
+    DBOptions db_options;
+    test::RandomInitDBOptions(&db_options, rnd);
+    auto sanitized_options = SanitizeOptions(dbname_, db_options);
+    return GetMutableDBOptionsMap(sanitized_options);
+  }
+#endif  // ROCKSDB_LITE
+};
+
+// RocksDB lite doesn't support dynamic options.
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, GetLatestDBOptions) {
+  // GetOptions should be able to get latest option changed by SetOptions.
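+  // A minimal sketch of the round trip exercised here (option name and value
+  // are illustrative; SetDBOptions takes a map of option name to string):
+  //   ASSERT_OK(dbfull()->SetDBOptions({{"max_background_jobs", "8"}}));
+  //   ASSERT_EQ(8, dbfull()->GetDBOptions().max_background_jobs);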
+ Options options; + options.create_if_missing = true; + options.env = env_; + Random rnd(228); + Reopen(options); + auto new_options = GetRandomizedMutableDBOptionsMap(&rnd); + ASSERT_OK(dbfull()->SetDBOptions(new_options)); + ASSERT_EQ(new_options, GetMutableDBOptionsMap(dbfull()->GetDBOptions())); +} + +TEST_F(DBOptionsTest, GetLatestCFOptions) { + // GetOptions should be able to get latest option changed by SetOptions. + Options options; + options.create_if_missing = true; + options.env = env_; + Random rnd(228); + Reopen(options); + CreateColumnFamilies({"foo"}, options); + ReopenWithColumnFamilies({"default", "foo"}, options); + auto options_default = GetRandomizedMutableCFOptionsMap(&rnd); + auto options_foo = GetRandomizedMutableCFOptionsMap(&rnd); + ASSERT_OK(dbfull()->SetOptions(handles_[0], options_default)); + ASSERT_OK(dbfull()->SetOptions(handles_[1], options_foo)); + ASSERT_EQ(options_default, + GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[0]))); + ASSERT_EQ(options_foo, + GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1]))); +} + +TEST_F(DBOptionsTest, SetBytesPerSync) { + const size_t kValueSize = 1024 * 1024; // 1MB + Options options; + options.create_if_missing = true; + options.bytes_per_sync = 1024 * 1024; + options.use_direct_reads = false; + options.write_buffer_size = 400 * kValueSize; + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env_; + Reopen(options); + int counter = 0; + int low_bytes_per_sync = 0; + int i = 0; + const std::string kValue(kValueSize, 'v'); + ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; }); + + WriteOptions write_opts; + // should sync approximately 40MB/1MB ~= 40 times. + for (i = 0; i < 40; i++) { + Put(Key(i), kValue, write_opts); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + low_bytes_per_sync = counter; + ASSERT_GT(low_bytes_per_sync, 35); + ASSERT_LT(low_bytes_per_sync, 45); + + counter = 0; + // 8388608 = 8 * 1024 * 1024 + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}})); + ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync); + // should sync approximately 40MB*2/8MB ~= 10 times. + // data will be 40*2MB because of previous Puts too. + for (i = 0; i < 40; i++) { + Put(Key(i), kValue, write_opts); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GT(counter, 5); + ASSERT_LT(counter, 15); + + // Redundant assert. But leaving it here just to get the point across that + // low_bytes_per_sync > counter. 
+  ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, SetWalBytesPerSync) {
+  const size_t kValueSize = 1024 * 1024 * 3;
+  Options options;
+  options.create_if_missing = true;
+  options.wal_bytes_per_sync = 512;
+  options.write_buffer_size = 100 * kValueSize;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync);
+  int counter = 0;
+  int low_bytes_per_sync = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  const std::string kValue(kValueSize, 'v');
+  int i = 0;
+  for (; i < 10; i++) {
+    Put(Key(i), kValue);
+  }
+  // Do not flush. If we flush here, SwitchWAL will reuse the old WAL file
+  // since it's empty and will not get the new wal_bytes_per_sync value.
+  low_bytes_per_sync = counter;
+  // 5242880 = 1024 * 1024 * 5
+  ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}}));
+  ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync);
+  counter = 0;
+  i = 0;
+  for (; i < 10; i++) {
+    Put(Key(i), kValue);
+  }
+  ASSERT_GT(counter, 0);
+  ASSERT_GT(low_bytes_per_sync, 0);
+  ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
+  Options options;
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 1024 * 1024;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_manifest_file_size = 1;
+  options.env = env_;
+  int buffer_size = 1024 * 1024;
+  Reopen(options);
+  ASSERT_EQ(buffer_size,
+            dbfull()->GetDBOptions().writable_file_max_buffer_size);
+
+  std::atomic<int> match_cnt(0);
+  std::atomic<int> unmatch_cnt(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
+        int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
+        if (value == buffer_size) {
+          match_cnt++;
+        } else {
+          unmatch_cnt++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  int i = 0;
+  for (; i < 3; i++) {
+    ASSERT_OK(Put("foo", ToString(i)));
+    ASSERT_OK(Put("bar", ToString(i)));
+    Flush();
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(unmatch_cnt, 0);
+  ASSERT_GE(match_cnt, 11);
+
+  ASSERT_OK(
+      dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
+  buffer_size = 512 * 1024;
+  match_cnt = 0;
+  unmatch_cnt = 0;  // SetDBOptions() will create a WritableFileWriter
+
+  ASSERT_EQ(buffer_size,
+            dbfull()->GetDBOptions().writable_file_max_buffer_size);
+  i = 0;
+  for (; i < 3; i++) {
+    ASSERT_OK(Put("foo", ToString(i)));
+    ASSERT_OK(Put("bar", ToString(i)));
+    Flush();
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(unmatch_cnt, 0);
+  ASSERT_GE(match_cnt, 11);
+}
+
+TEST_F(DBOptionsTest, SetOptionsAndReopen) {
+  Random rnd(1044);
+  auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);
+  ASSERT_OK(dbfull()->SetOptions(rand_opts));
+  // Verify the DB can be reopened after setting options.
+ Options options; + options.env = env_; + ASSERT_OK(TryReopen(options)); +} + +TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) { + const std::string kValue(1024, 'v'); + for (int method_type = 0; method_type < 2; method_type++) { + for (int option_type = 0; option_type < 4; option_type++) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.write_buffer_size = 1024 * 1024 * 10; + options.compression = CompressionType::kNoCompression; + options.level0_file_num_compaction_trigger = 1; + options.level0_stop_writes_trigger = std::numeric_limits::max(); + options.level0_slowdown_writes_trigger = std::numeric_limits::max(); + options.hard_pending_compaction_bytes_limit = + std::numeric_limits::max(); + options.soft_pending_compaction_bytes_limit = + std::numeric_limits::max(); + options.env = env_; + + DestroyAndReopen(options); + int i = 0; + for (; i < 1024; i++) { + Put(Key(i), kValue); + } + Flush(); + for (; i < 1024 * 2; i++) { + Put(Key(i), kValue); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + uint64_t l0_size = SizeAtLevel(0); + + switch (option_type) { + case 0: + // test with level0_stop_writes_trigger + options.level0_stop_writes_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + break; + case 1: + options.level0_slowdown_writes_trigger = 2; + break; + case 2: + options.hard_pending_compaction_bytes_limit = l0_size; + options.soft_pending_compaction_bytes_limit = l0_size; + break; + case 3: + options.soft_pending_compaction_bytes_limit = l0_size; + break; + } + Reopen(options); + dbfull()->TEST_WaitForCompact(); + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction():BeforePickCompaction", + "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"}, + {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3", + "DBImpl::BackgroundCompaction():AfterPickCompaction"}}); + // Block background compaction. + SyncPoint::GetInstance()->EnableProcessing(); + + switch (method_type) { + case 0: + ASSERT_OK( + dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + break; + case 1: + ASSERT_OK(dbfull()->EnableAutoCompaction( + {dbfull()->DefaultColumnFamily()})); + break; + } + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1"); + // Wait for stall condition recalculate. + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"); + + switch (option_type) { + case 0: + ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped()); + break; + case 1: + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + break; + case 2: + ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped()); + break; + case 3: + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + break; + } + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3"); + + // Background compaction executed. 
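+      // Once the compaction finishes, both stall conditions must clear.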
+ dbfull()->TEST_WaitForCompact(); + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + } + } +} + +TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) { + Options options; + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 1000; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; i++) { + // Need to insert two keys to avoid trivial move. + ASSERT_OK(Put("foo", ToString(i))); + ASSERT_OK(Put("bar", ToString(i))); + Flush(); + } + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_OK( + dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}})); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel()); +} + +TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) { + Options options; + options.create_if_missing = true; + options.max_background_compactions = 1; // default value + options.env = env_; + Reopen(options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}})); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + auto stop_token = dbfull()->TEST_write_controler().GetStopToken(); + ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed()); +} + +TEST_F(DBOptionsTest, SetBackgroundJobs) { + Options options; + options.create_if_missing = true; + options.max_background_jobs = 8; + options.env = env_; + Reopen(options); + + for (int i = 0; i < 2; ++i) { + if (i > 0) { + options.max_background_jobs = 12; + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_background_jobs", + std::to_string(options.max_background_jobs)}})); + } + + const int expected_max_flushes = options.max_background_jobs / 4; + + ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed()); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + auto stop_token = dbfull()->TEST_write_controler().GetStopToken(); + + const int expected_max_compactions = 3 * expected_max_flushes; + + ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed()); + ASSERT_EQ(expected_max_compactions, dbfull()->TEST_BGCompactionsAllowed()); + + ASSERT_EQ(expected_max_flushes, + env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_EQ(expected_max_compactions, + env_->GetBackgroundThreads(Env::Priority::LOW)); + } +} + +TEST_F(DBOptionsTest, AvoidFlushDuringShutdown) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + WriteOptions write_without_wal; + write_without_wal.disableWAL = true; + + ASSERT_FALSE(options.avoid_flush_during_shutdown); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v1", write_without_wal)); + Reopen(options); + ASSERT_EQ("v1", Get("foo")); + ASSERT_EQ("1", FilesPerLevel()); + + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "v2", write_without_wal)); + ASSERT_OK(dbfull()->SetDBOptions({{"avoid_flush_during_shutdown", "true"}})); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ("", FilesPerLevel()); +} + +TEST_F(DBOptionsTest, SetDelayedWriteRateOption) { + Options options; + options.create_if_missing = true; + options.delayed_write_rate = 2 * 1024U * 1024U; + options.env = env_; + Reopen(options); + ASSERT_EQ(2 * 1024U * 1024U, dbfull()->TEST_write_controler().max_delayed_write_rate()); + + ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}})); + ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate()); +} + +TEST_F(DBOptionsTest, MaxTotalWalSizeChange) { + Random 
rnd(1044); + const auto value_size = size_t(1024); + std::string value; + test::RandomString(&rnd, value_size, &value); + + Options options; + options.create_if_missing = true; + options.env = env_; + CreateColumnFamilies({"1", "2", "3"}, options); + ReopenWithColumnFamilies({"default", "1", "2", "3"}, options); + + WriteOptions write_options; + + const int key_count = 100; + for (int i = 0; i < key_count; ++i) { + for (size_t cf = 0; cf < handles_.size(); ++cf) { + ASSERT_OK(Put(static_cast(cf), Key(i), value)); + } + } + ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}})); + + for (size_t cf = 0; cf < handles_.size(); ++cf) { + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_EQ("1", FilesPerLevel(static_cast(cf))); + } +} + +TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_dump_period_sec = 5; + options.env = env_; + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); + + for (int i = 0; i < 20; i++) { + unsigned int num = rand() % 5000 + 1; + ASSERT_OK( + dbfull()->SetDBOptions({{"stats_dump_period_sec", ToString(num)}})); + ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec); + } + Close(); +} + +TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.env = env_; + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); + + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}})); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); + ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}})); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); +} + +static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { + dbfull->TEST_LockMutex(); + JobContext job_context(0); + dbfull->FindObsoleteFiles(&job_context, false); + ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty()); + dbfull->TEST_UnlockMutex(); + if (job_context.HaveSomethingToDelete()) { + // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles + // afterwards; otherwise the test may hang on shutdown + dbfull->PurgeObsoleteFiles(job_context); + } + job_context.Clean(); +} + +TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) { + SpecialEnv env(env_); + env.time_elapse_only_sleep_ = true; + Options options; + options.env = &env; + options.create_if_missing = true; + ASSERT_OK(TryReopen(options)); + + // Verify that candidate files set is empty when no full scan requested. 
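+  // delete_obsolete_files_period_micros bounds how often FindObsoleteFiles
+  // does a full scan: 0 forces a full scan on every call, while a period of
+  // N microseconds allows one only after more than N microseconds of
+  // (simulated) time have elapsed.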
+ assert_candidate_files_empty(dbfull(), true); + + ASSERT_OK( + dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "0"}})); + + // After delete_obsolete_files_period_micros updated to 0, the next call + // to FindObsoleteFiles should make a full scan + assert_candidate_files_empty(dbfull(), false); + + ASSERT_OK( + dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "20"}})); + + assert_candidate_files_empty(dbfull(), true); + + env.addon_time_.store(20); + assert_candidate_files_empty(dbfull(), true); + + env.addon_time_.store(21); + assert_candidate_files_empty(dbfull(), false); + + Close(); +} + +TEST_F(DBOptionsTest, MaxOpenFilesChange) { + SpecialEnv env(env_); + Options options; + options.env = CurrentOptions().env; + options.max_open_files = -1; + + Reopen(options); + + Cache* tc = dbfull()->TEST_table_cache(); + + ASSERT_EQ(-1, dbfull()->GetDBOptions().max_open_files); + ASSERT_LT(2000, tc->GetCapacity()); + ASSERT_OK(dbfull()->SetDBOptions({{"max_open_files", "1024"}})); + ASSERT_EQ(1024, dbfull()->GetDBOptions().max_open_files); + // examine the table cache (actual size should be 1014) + ASSERT_GT(1500, tc->GetCapacity()); + Close(); +} + +TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { + Options options; + options.delayed_write_rate = 0; + Reopen(options); + ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); + + options.rate_limiter.reset(NewGenericRateLimiter(31 * 1024 * 1024)); + Reopen(options); + ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); +} + +TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + + options.ttl = 0; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 0; + options.periodic_compaction_seconds = 100; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 500; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); +} + +TEST_F(DBOptionsTest, SanitizeTtlDefault) { + Options options; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.compaction_style = kCompactionStyleLevel; + options.ttl = 0; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); +} + +TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.ttl = 0; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + + options.ttl = 100 * 24 * 60 * 60; + Reopen(options); + ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 200; + options.periodic_compaction_seconds = 300; + Reopen(options); + ASSERT_EQ(200, dbfull()->GetOptions().ttl); + + options.ttl = 500; + options.periodic_compaction_seconds = 300; + Reopen(options); + ASSERT_EQ(300, 
dbfull()->GetOptions().ttl); +} + +TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 10 << 10; // 10KB + options.arena_block_size = 4096; + options.compression = kNoCompression; + options.create_if_missing = true; + options.compaction_options_fifo.allow_compaction = false; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + // Test dynamically changing ttl. + env_->addon_time_.store(0); + options.ttl = 1 * 60 * 60; // 1 hour + ASSERT_OK(TryReopen(options)); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Add 61 seconds to the time. + env_->addon_time_.fetch_add(61); + + // No files should be compacted as ttl is set to 1 hour. + ASSERT_EQ(dbfull()->GetOptions().ttl, 3600); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Set ttl to 1 minute. So all files should get deleted. + ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}})); + ASSERT_EQ(dbfull()->GetOptions().ttl, 60); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Test dynamically changing compaction_options_fifo.max_table_files_size + env_->addon_time_.store(0); + options.compaction_options_fifo.max_table_files_size = 500 << 10; // 00KB + options.ttl = 0; + DestroyAndReopen(options); + + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // No files should be compacted as max_table_files_size is set to 500 KB. + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 500 << 10); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Set max_table_files_size to 12 KB. So only 1 file should remain now. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{max_table_files_size=12288;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 12 << 10); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + + // Test dynamically changing compaction_options_fifo.allow_compaction + options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB + options.ttl = 0; + options.compaction_options_fifo.allow_compaction = false; + options.level0_file_num_compaction_trigger = 6; + DestroyAndReopen(options); + + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. 
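+    // (10 values of ~980 bytes each, so roughly 10KB per file.)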
+ for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // No files should be compacted as max_table_files_size is set to 500 KB and + // allow_compaction is false + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Set allow_compaction to true. So number of files should be between 1 and 5. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{allow_compaction=true;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_GE(NumTableFilesAtLevel(0), 1); + ASSERT_LE(NumTableFilesAtLevel(0), 5); +} + +TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) { + SpecialEnv env(env_); + Options options; + options.env = &env; + + options.compaction_readahead_size = 0; + options.new_table_reader_for_compaction_inputs = true; + options.level0_file_num_compaction_trigger = 2; + const std::string kValue(1024, 'v'); + Reopen(options); + + ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size); + ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}})); + ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size); + for (int i = 0; i < 1024; i++) { + Put(Key(i), kValue); + } + Flush(); + for (int i = 0; i < 1024 * 2; i++) { + Put(Key(i), kValue); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(256, env_->compaction_readahead_size_); + Close(); +} + +TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 10 << 10; // 10KB + options.create_if_missing = true; + + ASSERT_OK(TryReopen(options)); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // In release 6.0, ttl was promoted from a secondary level option under + // compaction_options_fifo to a top level option under ColumnFamilyOptions. + // We still need to handle old SetOptions calls but should ignore + // ttl under compaction_options_fifo. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"}, + {"ttl", "60"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 60); + + // Put ttl as the first option inside compaction_options_fifo. That works as + // it doesn't overwrite any other option. 
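+  // (The embedded ttl=985 is parsed but discarded; the top-level
+  // {"ttl", "191"} below is what takes effect.)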
+ ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"}, + {"ttl", "191"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 191); +} + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_properties_test.cc b/src/rocksdb/db/db_properties_test.cc new file mode 100644 index 000000000..50dc3efef --- /dev/null +++ b/src/rocksdb/db/db_properties_test.cc @@ -0,0 +1,1711 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/perf_level.h" +#include "rocksdb/table.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class DBPropertiesTest : public DBTestBase { + public: + DBPropertiesTest() : DBTestBase("/db_properties_test") {} +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBPropertiesTest, Empty) { + do { + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options.allow_concurrent_memtable_write = false; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + std::string num; + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("0", num); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); + + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("2", num); + + Put(1, "k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); + + ASSERT_EQ("v1", Get(1, "foo")); + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); + + ASSERT_OK(db_->EnableFileDeletions(false)); + ASSERT_TRUE( + 
dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); + + ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("1", num); + } while (ChangeOptions()); +} + +TEST_F(DBPropertiesTest, CurrentVersionNumber) { + uint64_t v1, v2, v3; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1)); + Put("12345678", ""); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3)); + + ASSERT_EQ(v1, v2); + ASSERT_GT(v3, v2); +} + +TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) { + const int kKeySize = 100; + const int kValueSize = 500; + const int kKeyNum = 100; + + Options options; + options.env = env_; + options.create_if_missing = true; + options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10; + // Make them never flush + options.min_write_buffer_number_to_merge = 1000; + options.max_write_buffer_number = 1000; + options = CurrentOptions(options); + CreateAndReopenWithCF({"one", "two", "three", "four"}, options); + + Random rnd(301); + for (auto* handle : handles_) { + for (int i = 0; i < kKeyNum; ++i) { + db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + } + + uint64_t manual_sum = 0; + uint64_t api_sum = 0; + uint64_t value = 0; + for (auto* handle : handles_) { + ASSERT_TRUE( + db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value)); + manual_sum += value; + } + ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables, + &api_sum)); + ASSERT_GT(manual_sum, 0); + ASSERT_EQ(manual_sum, api_sum); + + ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value)); + + uint64_t before_flush_trm; + uint64_t after_flush_trm; + for (auto* handle : handles_) { + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &before_flush_trm)); + + // Issue flush and expect larger memory usage of table readers. + db_->Flush(FlushOptions(), handle); + + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &after_flush_trm)); + ASSERT_GT(after_flush_trm, before_flush_trm); + } +} + +namespace { +void ResetTableProperties(TableProperties* tp) { + tp->data_size = 0; + tp->index_size = 0; + tp->filter_size = 0; + tp->raw_key_size = 0; + tp->raw_value_size = 0; + tp->num_data_blocks = 0; + tp->num_entries = 0; + tp->num_deletions = 0; + tp->num_merge_operands = 0; + tp->num_range_deletions = 0; +} + +void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { + double dummy_double; + std::replace(tp_string.begin(), tp_string.end(), ';', ' '); + std::replace(tp_string.begin(), tp_string.end(), '=', ' '); + ResetTableProperties(tp); + sscanf(tp_string.c_str(), + "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64 + " # merge operands %" SCNu64 " # range deletions %" SCNu64 + " raw key size %" SCNu64 + " raw average key size %lf " + " raw value size %" SCNu64 + " raw average value size %lf " + " data block size %" SCNu64 " index block size (user-key? %" SCNu64 + ", delta-value? 
%" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, + &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, + &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, + &tp->index_size, &tp->filter_size); +} + +void VerifySimilar(uint64_t a, uint64_t b, double bias) { + ASSERT_EQ(a == 0U, b == 0U); + if (a == 0) { + return; + } + double dbl_a = static_cast(a); + double dbl_b = static_cast(b); + if (dbl_a > dbl_b) { + ASSERT_LT(static_cast(dbl_a - dbl_b) / (dbl_a + dbl_b), bias); + } else { + ASSERT_LT(static_cast(dbl_b - dbl_a) / (dbl_a + dbl_b), bias); + } +} + +void VerifyTableProperties( + const TableProperties& base_tp, const TableProperties& new_tp, + double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.15 : 0.1, + double index_size_bias = 0.1, double data_size_bias = 0.1, + double num_data_blocks_bias = 0.05) { + VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); + VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); + VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); + VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks, + num_data_blocks_bias); + + ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); + ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); + ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); + ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions); + ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); + + // Merge operands may become Puts, so we only have an upper bound the exact + // number of merge operands. + ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands); +} + +void GetExpectedTableProperties( + TableProperties* expected_tp, const int kKeySize, const int kValueSize, + const int kPutsPerTable, const int kDeletionsPerTable, + const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, + const bool index_key_is_user_key, const bool value_delta_encoding) { + const int kKeysPerTable = + kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable; + const int kPutCount = kTableCount * kPutsPerTable; + const int kDeletionCount = kTableCount * kDeletionsPerTable; + const int kMergeCount = kTableCount * kMergeOperandsPerTable; + const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; + const int kKeyCount = kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount; + const int kAvgSuccessorSize = kKeySize / 5; + const int kEncodingSavePerKey = kKeySize / 4; + expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); + expected_tp->raw_value_size = + (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize; + expected_tp->num_entries = kKeyCount; + expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount; + expected_tp->num_merge_operands = kMergeCount; + expected_tp->num_range_deletions = kRangeDeletionCount; + expected_tp->num_data_blocks = + kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / + kBlockSize; + expected_tp->data_size = + kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); + expected_tp->index_size = + expected_tp->num_data_blocks * + (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) - + // discount 1 byte as value size is not encoded in value delta encoding + (value_delta_encoding ? 
+  expected_tp->filter_size =
+      kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 +
+                     /*average-ish overhead*/ CACHE_LINE_SIZE / 2);
+}
+}  // anonymous namespace
+
+TEST_F(DBPropertiesTest, ValidatePropertyInfo) {
+  for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+    // If C++ gets a std::string_literal, this would be better to check at
+    // compile-time using static_assert.
+    ASSERT_TRUE(ppt_name_and_info.first.empty() ||
+                !isdigit(ppt_name_and_info.first.back()));
+
+    int count = 0;
+    count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1;
+    count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1;
+    count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1;
+    ASSERT_TRUE(count == 1);
+  }
+}
+
+TEST_F(DBPropertiesTest, ValidateSampleNumber) {
+  // When "max_open_files" is -1, we read all the files for
+  // "rocksdb.estimate-num-keys" computation, which is the ground truth.
+  // Otherwise, we sample the 20 newest files to make an estimate.
+  // Formula: latest_20_files_active_key_ratio * total_files
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_stop_writes_trigger = 1000;
+  DestroyAndReopen(options);
+  int key = 0;
+  for (int files = 20; files >= 10; files -= 10) {
+    for (int i = 0; i < files; i++) {
+      int rows = files / 10;
+      for (int j = 0; j < rows; j++) {
+        db_->Put(WriteOptions(), std::to_string(++key), "foo");
+      }
+      db_->Flush(FlushOptions());
+    }
+  }
+  std::string num;
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ("45", num);
+  options.max_open_files = -1;
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ("50", num);
+}
+
+TEST_F(DBPropertiesTest, AggregatedTableProperties) {
+  for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+    const int kDeletionsPerTable = 5;
+    const int kMergeOperandsPerTable = 15;
+    const int kRangeDeletionsPerTable = 5;
+    const int kPutsPerTable = 100;
+    const int kKeySize = 80;
+    const int kValueSize = 200;
+    const int kBloomBitsPerKey = 20;
+
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = 8;
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    options.preserve_deletes = true;
+    options.merge_operator.reset(new TestPutOperator());
+
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(
+        NewBloomFilterPolicy(kBloomBitsPerKey, false));
+    table_options.block_size = 1024;
+    options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+
+    // Hold open a snapshot to prevent range tombstones from being compacted
+    // away.
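+    // (A snapshot pins a sequence number, and range tombstones still visible
+    // to it cannot be dropped as obsolete during compaction. ManagedSnapshot
+    // releases the snapshot via RAII when it goes out of scope.)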
+    ManagedSnapshot snapshot(db_);
+
+    Random rnd(5632);
+    for (int table = 1; table <= kTableCount; ++table) {
+      for (int i = 0; i < kPutsPerTable; ++i) {
+        db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+                 RandomString(&rnd, kValueSize));
+      }
+      for (int i = 0; i < kDeletionsPerTable; i++) {
+        db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize));
+      }
+      for (int i = 0; i < kMergeOperandsPerTable; i++) {
+        db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize),
+                   RandomString(&rnd, kValueSize));
+      }
+      for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+        std::string start = RandomString(&rnd, kKeySize);
+        std::string end = start;
+        end.resize(kValueSize);
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start,
+                         end);
+      }
+      db_->Flush(FlushOptions());
+    }
+    std::string property;
+    db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+    TableProperties output_tp;
+    ParseTablePropertiesString(property, &output_tp);
+    bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
+    bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
+
+    TableProperties expected_tp;
+    GetExpectedTableProperties(
+        &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+        kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount,
+        kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+        value_is_delta_encoded);
+
+    VerifyTableProperties(expected_tp, output_tp);
+  }
+}
+
+TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;
+  options.level0_file_num_compaction_trigger = 6;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 4500 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_write_buffer_number = 2;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  // Make sure there is no preloading of table readers.
+  options.max_open_files = 11;
+
+  // RocksDB sanitizes max_open_files to at least 20; modify it back.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int key_index = 0;
+  Random rnd(301);
+  for (int num = 0; num < 8; num++) {
+    Put("foo", "bar");
+    GenerateNewFile(&rnd, &key_index);
+    dbfull()->TEST_WaitForCompact();
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+
+  // Get() after flushes; see that the latency histogram is tracked.
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and issue Get(); see that the latency is tracked.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  dbfull()->TEST_WaitForCompact();
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+
+  // Test for getting immutable_db_options_.statistics
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.options-statistics", &prop));
+  ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss"));
+  ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros"));
+
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and iterate; see that the latency is tracked.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+    }
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // CF 1 should show no histogram.
+  ASSERT_TRUE(
+      dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  // Put something and read it back; CF 1 should show a histogram.
+  Put(1, "foo", "bar");
+  Flush(1);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  ASSERT_TRUE(
+      dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Setting max_open_files to -1 preloads table readers.
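+  // (Preloading at DB open touches each file's footer and index, and those
+  // reads already count toward the per-level file-read histograms, which is
+  // why the histograms appear below before any Get().)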
+  options.max_open_files = -1;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Clear internal stats
+  dbfull()->ResetStats();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
+
+TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
+  const int kTableCount = 100;
+  const int kDeletionsPerTable = 2;
+  const int kMergeOperandsPerTable = 2;
+  const int kRangeDeletionsPerTable = 2;
+  const int kPutsPerTable = 10;
+  const int kKeySize = 50;
+  const int kValueSize = 400;
+  const int kMaxLevel = 7;
+  const int kBloomBitsPerKey = 20;
+  Random rnd(301);
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 8192;
+  options.max_bytes_for_level_base = 10000;
+  options.max_bytes_for_level_multiplier = 2;
+  // This ensures no compaction is happening when we call GetProperty().
+  options.disable_auto_compactions = true;
+  options.preserve_deletes = true;
+  options.merge_operator.reset(new TestPutOperator());
+
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(
+      NewBloomFilterPolicy(kBloomBitsPerKey, false));
+  table_options.block_size = 1024;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Hold open a snapshot to prevent range tombstones from being compacted
+  // away.
+  ManagedSnapshot snapshot(db_);
+
+  std::string level_tp_strings[kMaxLevel];
+  std::string tp_string;
+  TableProperties level_tps[kMaxLevel];
+  TableProperties tp, sum_tp, expected_tp;
+  for (int table = 1; table <= kTableCount; ++table) {
+    for (int i = 0; i < kPutsPerTable; ++i) {
+      db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+               RandomString(&rnd, kValueSize));
+    }
+    for (int i = 0; i < kDeletionsPerTable; i++) {
+      db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize));
+    }
+    for (int i = 0; i < kMergeOperandsPerTable; i++) {
+      db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize),
+                 RandomString(&rnd, kValueSize));
+    }
+    for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+      std::string start = RandomString(&rnd, kKeySize);
+      std::string end = start;
+      end.resize(kValueSize);
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
+    }
+    db_->Flush(FlushOptions());
+    db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    ResetTableProperties(&sum_tp);
+    for (int level = 0; level < kMaxLevel; ++level) {
+      db_->GetProperty(
+          DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level),
+          &level_tp_strings[level]);
+      ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]);
+      sum_tp.data_size += level_tps[level].data_size;
+      sum_tp.index_size += level_tps[level].index_size;
+      sum_tp.filter_size += level_tps[level].filter_size;
+      sum_tp.raw_key_size += level_tps[level].raw_key_size;
+      sum_tp.raw_value_size += level_tps[level].raw_value_size;
+      sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
+      sum_tp.num_entries += level_tps[level].num_entries;
+      sum_tp.num_deletions += level_tps[level].num_deletions;
+      sum_tp.num_merge_operands += level_tps[level].num_merge_operands;
+      sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
+    }
+    db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
+    ParseTablePropertiesString(tp_string, &tp);
+    bool index_key_is_user_key = tp.index_key_is_user_key > 0;
+    bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
+    ASSERT_EQ(sum_tp.data_size, tp.data_size);
+    ASSERT_EQ(sum_tp.index_size, tp.index_size);
+    ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
+    ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size);
+    ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
+    ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
+    ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
+    ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions);
+    ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands);
+    ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
+    if (table > 3) {
+      GetExpectedTableProperties(
+          &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+          kMergeOperandsPerTable, kRangeDeletionsPerTable, table,
+          kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+          value_is_delta_encoded);
+      // Gives larger bias here as index block size, filter block size,
+      // and data block size become much harder to estimate in this test.
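+      // (Each bias below is the tolerance on |a - b| / (a + b) checked by
+      // VerifySimilar: 0.5 for filter size, 0.4 for index size, 0.4 for data
+      // size, and 0.25 for the data block count.)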
+      VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25);
+    }
+  }
+}
+
+TEST_F(DBPropertiesTest, NumImmutableMemTable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.write_buffer_size = 1000000;
+    options.max_write_buffer_size_to_maintain =
+        5 * static_cast<int64_t>(options.write_buffer_size);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    std::string big_value(1000000 * 2, 'x');
+    std::string num;
+    uint64_t value;
+    SetPerfLevel(kEnableTime);
+    ASSERT_TRUE(GetPerfLevel() == kEnableTime);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(1,
+              static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "1");
+
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(2,
+              static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k2");
+    ASSERT_EQ(1,
+              static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "2");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "2");
+    get_perf_context()->Reset();
+    Get(1, "k2");
+    ASSERT_EQ(2,
+              static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k3");
+    ASSERT_EQ(1,
+              static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(3,
+              static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(Flush(1));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+    ASSERT_EQ(num, "3");
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &value));
+    // "192" is the size of the metadata of two empty skiplists; this would
+    // break if we change the default skiplist implementation.
+    ASSERT_GE(value, 192);
+
+    uint64_t int_num;
+    uint64_t base_total_size;
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
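+    // (A Delete writes a tombstone entry into the active memtable, so the
+    // Delete above and the Put/Delete below feed both the
+    // num-entries-active-mem-table and num-deletes-active-mem-table
+    // properties checked next.)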
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 2U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 3U);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 4U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &int_num));
+    ASSERT_EQ(int_num, base_total_size + 1);
+
+    SetPerfLevel(kDisable);
+    ASSERT_TRUE(GetPerfLevel() == kDisable);
+  } while (ChangeCompactOptions());
+}
+
+// TODO(techdept) : Disabled flaky test #12863555
+TEST_F(DBPropertiesTest, DISABLED_GetProperty) {
+  // Set the size of both background thread pools to 1 and block them.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = 1;
+  options.compaction_options_universal.size_ratio = 50;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.write_buffer_size = 1000000;
+  Reopen(options);
+
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  uint64_t int_num;
+  SetPerfLevel(kEnableTime);
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "1");
+  get_perf_context()->Reset();
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "2");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "2"); + // Verify the same set of properties through GetIntProperty + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num)); + ASSERT_EQ(int_num, 2U); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num)); + ASSERT_EQ(int_num, 1U); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num)); + ASSERT_EQ(int_num, 0U); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); + ASSERT_EQ(int_num, 2U); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); + ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value)); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "4"); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Wait for compaction to be done. This is important because otherwise RocksDB + // might schedule a compaction when reopening the database, failing assertion + // (A) as a result. + dbfull()->TEST_WaitForCompact(); + options.max_open_files = 10; + Reopen(options); + // After reopening, no table reader is loaded, so no memory for table readers + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); // (A) + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); + ASSERT_GT(int_num, 0U); + + // After reading a key, at least one table reader is loaded. + Get("k5"); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + // Test rocksdb.num-live-versions + { + options.level0_file_num_compaction_trigger = 20; + Reopen(options); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 1U); + + // Use an iterator to hold current version + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + + ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 2U); + + // Use an iterator to hold current version + std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + + ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 3U); + + iter2.reset(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 2U); + + iter1.reset(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 1U); + } +} + +TEST_F(DBPropertiesTest, ApproximateMemoryUsage) { + const int kNumRounds = 10; + // TODO(noetzli) kFlushesPerRound does not really correlate with how many + // flushes happen. 
+  const int kFlushesPerRound = 10;
+  const int kWritesPerFlush = 10;
+  const int kKeySize = 100;
+  const int kValueSize = 1000;
+  Options options;
+  options.write_buffer_size = 1000;  // small write buffer
+  options.min_write_buffer_number_to_merge = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  std::vector<Iterator*> iters;
+
+  uint64_t active_mem;
+  uint64_t unflushed_mem;
+  uint64_t all_mem;
+  uint64_t prev_all_mem;
+
+  // Phase 0. Verify that the initial values of all these properties are the
+  // same, as we have no immutable mem-tables.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(all_mem, active_mem);
+  ASSERT_EQ(all_mem, unflushed_mem);
+
+  // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" to equal
+  // "size-all-mem-tables".
+  for (int r = 0; r < kNumRounds; ++r) {
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+      }
+    }
+    // Make sure that there is no flush between getting the two properties.
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // With no iterators, these two numbers should be the same.
+    ASSERT_EQ(unflushed_mem, all_mem);
+  }
+  prev_all_mem = all_mem;
+
+  // Phase 2. Keep issuing Put() but also create new iterators. This time we
+  // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+  for (int r = 0; r < kNumRounds; ++r) {
+    iters.push_back(db_->NewIterator(ReadOptions()));
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+      }
+    }
+    // Force flush to prevent flush from happening between getting the
+    // properties or after getting the properties and before the new round.
+    Flush();
+
+    // In the second round, add iterators.
+    dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    ASSERT_GT(all_mem, active_mem);
+    ASSERT_GT(all_mem, unflushed_mem);
+    ASSERT_GT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
+  }
+
+  // Phase 3. Delete iterators and expect "size-all-mem-tables" to shrink
+  // whenever we release an iterator.
+  for (auto* iter : iters) {
+    delete iter;
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // Expect the size to shrink.
+    ASSERT_LT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
+  }
+
+  // Expect all these three counters to be the same.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
+
+  // Phase 4. Reopen, and expect all these three counters to be the same
+  // again.
+  Reopen(options);
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
+}
+
+TEST_F(DBPropertiesTest, EstimatePendingCompBytes) {
+  // Set the size of both background thread pools to 1 and block them.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.write_buffer_size = 1000000;
+  Reopen(options);
+
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  uint64_t int_num;
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  Flush();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  Flush();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  Flush();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
+}
+
+TEST_F(DBPropertiesTest, EstimateCompressionRatio) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNumL0Files = 3;
+  const int kNumEntriesPerFile = 1000;
+
+  Options options = CurrentOptions();
+  options.compression_per_level = {kNoCompression, kSnappyCompression};
+  options.disable_auto_compactions = true;
+  options.num_levels = 2;
+  Reopen(options);
+
+  // The compression ratio is -1.0 when there are no open files at the level.
+  ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+
+  const std::string kVal(100, 'a');
+  for (int i = 0; i < kNumL0Files; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      // Put common data ("key") at end to prevent delta encoding from
+      // compressing the key effectively
+      std::string key = ToString(i) + ToString(j) + "key";
+      ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal));
+    }
+    Flush();
+  }
+
+  // No compression at L0, so the ratio is less than one.
+  ASSERT_LT(CompressionRatioAtLevel(0), 1.0);
+  ASSERT_GT(CompressionRatioAtLevel(0), 0.0);
+  ASSERT_EQ(CompressionRatioAtLevel(1), -1.0);
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+  ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+  // Data at L1 should be highly compressed thanks to Snappy and redundant
+  // data in values (ratio is 12.846 as of 4/19/2016).
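+  // (CompressionRatioAtLevel is a DBTestBase helper. As the assertions in
+  // this test show, it reports -1.0 for an empty level, a value slightly
+  // below 1 for uncompressed data once per-file overhead is counted, and
+  // values well above 1 when compression is effective.)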
+  ASSERT_GT(CompressionRatioAtLevel(1), 10.0);
+}
+
+#endif  // ROCKSDB_LITE
+
+class CountingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "CountingUserTblPropCollector"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{
+        {"CountingUserTblPropCollector", message_},
+        {"Count", encoded},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType /*type*/, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    ++count_;
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+};
+
+class CountingUserTblPropCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  explicit CountingUserTblPropCollectorFactory(
+      uint32_t expected_column_family_id)
+      : expected_column_family_id_(expected_column_family_id),
+        num_created_(0) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override {
+    EXPECT_EQ(expected_column_family_id_, context.column_family_id);
+    num_created_++;
+    return new CountingUserTblPropCollector();
+  }
+  const char* Name() const override {
+    return "CountingUserTblPropCollectorFactory";
+  }
+  void set_expected_column_family_id(uint32_t v) {
+    expected_column_family_id_ = v;
+  }
+  uint32_t expected_column_family_id_;
+  uint32_t num_created_;
+};
+
+class CountingDeleteTabPropCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "CountingDeleteTabPropCollector"; }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType type, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    if (type == kEntryDelete) {
+      num_deletes_++;
+    }
+    return Status::OK();
+  }
+
+  bool NeedCompact() const override { return num_deletes_ > 10; }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    *properties =
+        UserCollectedProperties{{"num_delete", ToString(num_deletes_)}};
+    return Status::OK();
+  }
+
+ private:
+  uint32_t num_deletes_ = 0;
+};
+
+class CountingDeleteTabPropCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return new CountingDeleteTabPropCollector();
+  }
+  const char* Name() const override {
+    return "CountingDeleteTabPropCollectorFactory";
+  }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.table_properties_collector_factories.resize(1);
+  std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CountingUserTblPropCollectorFactory>(0);
+  options.table_properties_collector_factories[0] = collector_factory;
+  Reopen(options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+    }
+    db_->Flush(FlushOptions());
+  }
+
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(4U, props.size());
+  uint32_t sum = 0;
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+                user_collected.end());
+    ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+    ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+    Slice key(user_collected.at("Count"));
+    uint32_t count;
+    ASSERT_TRUE(GetVarint32(&key, &count));
+    sum += count;
+  }
+  ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+
+  ASSERT_GT(collector_factory->num_created_, 0U);
+  collector_factory->num_created_ = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_GT(collector_factory->num_created_, 0U);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 3;
+  options.table_properties_collector_factories.resize(1);
+  std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CountingUserTblPropCollectorFactory>(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  // Create 2 files
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(1, ToString(table * 100 + i), "val");
+    }
+    Flush(1);
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(1, ToString(table * 100 + i), "val");
+    }
+    Flush(1);
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  // Come back to write to default column family
+  collector_factory->num_created_ = 0;
+  collector_factory->set_expected_column_family_id(0);  // default CF
+  // Create 2 tables in the default column family
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
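+  // (level0_file_num_compaction_trigger is 3 above, so the flushes in this
+  // loop push L0 to the trigger and schedule compactions, whose output files
+  // run the collectors again.)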
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_GT(collector_factory->num_created_, 0U);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) {
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.num_levels = 8;
+  options.env = env_;
+
+  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+      std::make_shared<CountingDeleteTabPropCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+
+  DestroyAndReopen(options);
+
+  const int kMaxKey = 1000;
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  if (NumTableFilesAtLevel(0) == 1) {
+    // Clear Level 0 so that when we later flush a file with deletions, we
+    // don't trigger an organic compaction.
+    ASSERT_OK(Put(Key(0), ""));
+    ASSERT_OK(Put(Key(kMaxKey * 2), ""));
+    Flush();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  {
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(kMaxKey - 100));
+    while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+      iter->Next();
+      ++c;
+    }
+    ASSERT_EQ(c, 200);
+  }
+
+  Delete(Key(0));
+  for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) {
+    Delete(Key(i));
+  }
+  Delete(Key(kMaxKey * 2));
+
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+
+  {
+    SetPerfLevel(kEnableCount);
+    get_perf_context()->Reset();
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(kMaxKey - 100));
+    while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+      iter->Next();
+    }
+    ASSERT_EQ(c, 0);
+    ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u);
+    ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u);
+    SetPerfLevel(kDisable);
+  }
+}
+
+TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 10;
+  options.level0_slowdown_writes_trigger = 10;
+  options.level0_stop_writes_trigger = 10;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+      std::make_shared<CountingDeleteTabPropCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+
+  DestroyAndReopen(options);
+
+  const int kMaxKey = 100;
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), ""));
+  }
+  Flush();
+  dbfull()->TEST_WaitForFlushMemTable();
+
+  for (int i = 1; i < kMaxKey - 1; i++) {
+    Delete(Key(i));
+  }
+  Flush();
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+  // Restart the DB. Although the number of files didn't reach
+  // options.level0_file_num_compaction_trigger, compaction should still be
+  // triggered because of the need-compaction hint.
+  options.disable_auto_compactions = false;
+  Reopen(options);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  {
+    SetPerfLevel(kEnableCount);
+    get_perf_context()->Reset();
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+      c++;
+    }
+    ASSERT_EQ(c, 2);
+    ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+    // We iterate every key twice. Is it a bug?
+    ASSERT_LE(get_perf_context()->internal_key_skipped_count, 2);
+    SetPerfLevel(kDisable);
+  }
+}
+
+TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
+  Options options;
+  Reopen(options);
+  Put("foo", "bar");
+  Delete("foo");
+  Delete("foo");
+  uint64_t num_keys = 0;
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys));
+  ASSERT_EQ(0, num_keys);
+}
+
+TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
+  std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
+  uint64_t oldest_key_time = 0;
+  Options options;
+  options.env = mock_env.get();
+
+  // "rocksdb.estimate-oldest-key-time" is only available with FIFO compaction.
+  mock_env->set_current_time(100);
+  for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                          kCompactionStyleNone}) {
+    options.compaction_style = compaction;
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_FALSE(dbfull()->GetIntProperty(
+        DB::Properties::kEstimateOldestKeyTime, &oldest_key_time));
+  }
+
+  options.compaction_style = kCompactionStyleFIFO;
+  options.ttl = 300;
+  options.compaction_options_fifo.allow_compaction = false;
+  DestroyAndReopen(options);
+
+  mock_env->set_current_time(100);
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time);
+  ASSERT_OK(Flush());
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time);
+
+  mock_env->set_current_time(200);
+  ASSERT_OK(Put("k2", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time);
+
+  mock_env->set_current_time(300);
+  ASSERT_OK(Put("k3", "v3"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("3", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time);
+
+  mock_env->set_current_time(450);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(200, oldest_key_time);
+
+  mock_env->set_current_time(550);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(300, oldest_key_time);
+
+  mock_env->set_current_time(650);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel());
+
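+  // With every file aged out by the 300-second TTL, no SST remains to supply
+  // a creation time, so the property lookup below fails.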
+  ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                        &oldest_key_time));
+
+  // Close before mock_env destructs.
+  Close();
+}
+
+TEST_F(DBPropertiesTest, SstFilesSize) {
+  struct TestListener : public EventListener {
+    void OnCompactionCompleted(DB* db,
+                               const CompactionJobInfo& /*info*/) override {
+      assert(callback_triggered == false);
+      assert(size_before_compaction > 0);
+      callback_triggered = true;
+      uint64_t total_sst_size = 0;
+      uint64_t live_sst_size = 0;
+      bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+                                   &total_sst_size);
+      ASSERT_TRUE(ok);
+      // total_sst_size includes files before and after compaction.
+      ASSERT_GT(total_sst_size, size_before_compaction);
+      ok =
+          db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+      ASSERT_TRUE(ok);
+      // live_sst_size includes only files after compaction.
+      ASSERT_GT(live_sst_size, 0);
+      ASSERT_LT(live_sst_size, size_before_compaction);
+    }
+
+    uint64_t size_before_compaction = 0;
+    bool callback_triggered = false;
+  };
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  Options options;
+  options.disable_auto_compactions = true;
+  options.listeners.push_back(listener);
+  Reopen(options);
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("key" + ToString(i), std::string(1000, 'v')));
+  }
+  ASSERT_OK(Flush());
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Delete("key" + ToString(i)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t sst_size;
+  bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size);
+  ASSERT_TRUE(ok);
+  ASSERT_GT(sst_size, 0);
+  listener->size_before_compaction = sst_size;
+  // Compact to clean all keys and trigger listener.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_TRUE(listener->callback_triggered);
+}
+
+TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
+  class TestListener : public EventListener {
+   public:
+    void OnTableFileCreated(const TableFileCreationInfo& info) override {
+      if (info.reason == TableFileCreationReason::kCompaction) {
+        // Verify the property indicates that SSTs created by a running
+        // compaction cannot be deleted.
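+        // (kMinObsoleteSstNumberToKeep is a lower bound on the numbers of
+        // obsolete files that must be retained, so it can never exceed the
+        // number of an output file the compaction is still writing, as
+        // asserted below.)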
+        uint64_t created_file_num;
+        FileType created_file_type;
+        std::string filename =
+            info.file_path.substr(info.file_path.rfind('/') + 1);
+        ASSERT_TRUE(
+            ParseFileName(filename, &created_file_num, &created_file_type));
+        ASSERT_EQ(kTableFile, created_file_type);
+
+        uint64_t keep_sst_lower_bound;
+        ASSERT_TRUE(
+            db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep,
+                                &keep_sst_lower_bound));
+
+        ASSERT_LE(keep_sst_lower_bound, created_file_num);
+        validated_ = true;
+      }
+    }
+
+    void SetDB(DB* db) { db_ = db; }
+
+    int GetNumCompactions() { return num_compactions_; }
+
+    // True if we've verified the property for at least one output file
+    bool Validated() { return validated_; }
+
+   private:
+    int num_compactions_ = 0;
+    bool validated_ = false;
+    DB* db_ = nullptr;
+  };
+
+  const int kNumL0Files = 4;
+
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  Options options = CurrentOptions();
+  options.listeners.push_back(listener);
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  DestroyAndReopen(options);
+  listener->SetDB(db_);
+
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // Make sure they overlap in keyspace to prevent trivial move
+    Put("key1", "val");
+    Put("key2", "val");
+    Flush();
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(listener->Validated());
+}
+
+TEST_F(DBPropertiesTest, BlockCacheProperties) {
+  Options options;
+  uint64_t value;
+
+  // Block cache properties are not available for tables other than
+  // block-based tables.
+  options.table_factory.reset(NewPlainTableFactory());
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  options.table_factory.reset(NewCuckooTableFactory());
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  // Block cache properties are not available if the block cache is not used.
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  // Test with empty block cache.
+  constexpr size_t kCapacity = 100;
+  LRUCacheOptions co;
+  co.capacity = kCapacity;
+  co.num_shard_bits = 0;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto block_cache = NewLRUCache(co);
+  table_options.block_cache = block_cache;
+  table_options.no_block_cache = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(0, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert an unpinned item into the cache and check size.
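+  // (Only the charge matters for the usage accounting below; a nullptr value
+  // with a nullptr deleter is sufficient for this test.)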
+  constexpr size_t kSize1 = 50;
+  block_cache->Insert("item1", nullptr /*value*/, kSize1, nullptr /*deleter*/);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(kSize1, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert a pinned item into the cache and check size.
+  constexpr size_t kSize2 = 30;
+  Cache::Handle* item2 = nullptr;
+  block_cache->Insert("item2", nullptr /*value*/, kSize2, nullptr /*deleter*/,
+                      &item2);
+  ASSERT_NE(nullptr, item2);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(kSize1 + kSize2, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2, value);
+
+  // Insert another pinned item to make the cache over-sized.
+  constexpr size_t kSize3 = 80;
+  Cache::Handle* item3 = nullptr;
+  block_cache->Insert("item3", nullptr /*value*/, kSize3, nullptr /*deleter*/,
+                      &item3);
+  ASSERT_NE(nullptr, item3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  // Item 1 is evicted.
+  ASSERT_EQ(kSize2 + kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2 + kSize3, value);
+
+  // Check size after release.
+  block_cache->Release(item2);
+  block_cache->Release(item3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  // item2 is evicted, while item3 remains in the cache after release.
+  ASSERT_EQ(kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 000000000..15225875d
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,1660 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBRangeDelTest : public DBTestBase {
+ public:
+  DBRangeDelTest() : DBTestBase("/db_range_del_test") {}
+
+  std::string GetNumericStr(int key) {
+    uint64_t uint64_key = static_cast<uint64_t>(key);
+    std::string str;
+    str.resize(8);
+    memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+    return str;
+  }
+};
+
+// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
+// supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+  // TODO: figure out why MmapReads trips the iterator pinning assertion in
+  // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+  // least be explicitly unsupported.
+  for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+    option_config_ = config;
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "dr1", "dr1")
+                    .IsNotSupported());
+  }
+}
+
+TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
+  WriteBatchWithIndex indexedBatch{};
+  ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
+                  .IsNotSupported());
+  ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "dr1", "dr2"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+  do {
+    Options opts = CurrentOptions();
+    opts.disable_auto_compactions = true;
+    opts.statistics = CreateDBStatistics();
+    DestroyAndReopen(opts);
+
+    // The snapshot protects the range tombstone from being dropped as
+    // obsolete.
+    const Snapshot* snapshot = db_->GetSnapshot();
+    db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z");
+    db_->Flush(FlushOptions());
+
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                true /* disallow_trivial_move */);
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+    db_->ReleaseSnapshot(snapshot);
+    // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+    // compactions as the above assertions about the number of files in a
+    // level do not hold true.
+  } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+                         kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+  // Regression test for exactly filled compaction output files. Previously
+  // another file would be generated containing all range deletions, which
+  // could invalidate the non-overlapping file boundary invariant.
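+  // (Files within an L1+ level must cover disjoint key ranges; an extra
+  // all-tombstone file overlapping its siblings would violate that
+  // invariant.)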
+  const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 2;
+  options.target_file_size_base = kFileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size_deviation = 50;  // each block holds two keys
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1));
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(RandomString(&rnd, 3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+      if (j == 0 && i > 0) {
+        dbfull()->TEST_WaitForFlushMemTable();
+      }
+    }
+  }
+  // put extra key to trigger final flush
+  ASSERT_OK(Put("", ""));
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+  // Ensures range deletion spanning multiple compaction output files that are
+  // cut by max_compaction_bytes will have non-overlapping key-ranges.
+  // https://github.com/facebook/rocksdb/issues/1778
+  const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.disable_auto_compactions = true;
+  opts.level0_file_num_compaction_trigger = kNumFiles;
+  opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  // Want max_compaction_bytes to trigger the end of compaction output file,
+  // not target_file_size_base, so make the latter much bigger.
+  opts.target_file_size_base = 100 * opts.max_compaction_bytes;
+  Reopen(opts);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
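+  // (Without a snapshot older than the tombstone, a compaction reaching the
+  // bottommost level could drop the tombstone as obsolete before the
+  // assertions below get to observe it.)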
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // It spans the whole key-range, thus will be included in all output files
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             GetNumericStr(0),
+                             GetNumericStr(kNumFiles * kNumPerFile - 1)));
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 1MB (256 values, each 4K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(RandomString(&rnd, kBytesPerVal));
+      ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+    }
+    // extra entry to trigger SpecialSkipListFactory's flush
+    ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GE(NumTableFilesAtLevel(1), 2);
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+  for (size_t i = 0; i < files[1].size() - 1; ++i) {
+    ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+                    .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+                0);
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+  // Regression test for bug where sentinel range deletions (i.e., ones with
+  // sequence number of zero) were included in output files.
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // gaps between ranges create sentinels in our internal representation
+  std::vector<std::pair<std::string, std::string>> range_dels = {
+      {"a", "b"}, {"c", "d"}, {"e", "f"}};
+  for (const auto& range_del : range_dels) {
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               range_del.first, range_del.second));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+  ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+  db_->Put(WriteOptions(), "b1", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  db_->Put(WriteOptions(), "b2", "val");
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  // first iteration verifies query correctness in memtable, second verifies
+  // query correctness for a single SST file
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    }
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+    ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+  db_->Put(WriteOptions(), "unused", "val");  // prevents empty after compaction
+  db_->Put(WriteOptions(), "b1", "val");
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                  true /* disallow_trivial_move */);
+      ASSERT_EQ(0,
NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + } + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound()); + } +} +#endif // ROCKSDB_LITE + +TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { + const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250; + Options opts = CurrentOptions(); + opts.comparator = test::Uint64Comparator(); + Reopen(opts); + + // Write a third before snapshot, a third between snapshot and tombstone, and + // a third after the tombstone. Keys older than snapshot or newer than the + // tombstone should be preserved. + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNum; ++i) { + if (i == kNum / 3) { + snapshot = db_->GetSnapshot(); + } else if (i == 2 * kNum / 3) { + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + } + db_->Put(WriteOptions(), GetNumericStr(i), "val"); + } + db_->Flush(FlushOptions()); + + for (int i = 0; i < kNum; ++i) { + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + std::string value; + if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) { + ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value)); + } else { + ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound()); + } + } + db_->ReleaseSnapshot(snapshot); +} + +// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) { + const int kNumPerFile = 100, kNumFiles = 4; + Options opts = CurrentOptions(); + opts.comparator = test::Uint64Comparator(); + opts.disable_auto_compactions = true; + opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + opts.num_levels = 2; + opts.statistics = CreateDBStatistics(); + Reopen(opts); + + for (int i = 0; i < kNumFiles; ++i) { + if (i > 0) { + // range tombstone covers first half of the previous file + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr((i - 1) * kNumPerFile), + GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)); + } + // Make sure a given key appears in each file so compaction won't be able to + // use trivial move, which would happen if the ranges were non-overlapping. + // Also, we need an extra element since flush is only triggered when the + // number of keys is one greater than SpecialSkipListFactory's limit. + // We choose a key outside the key-range used by the test to avoid conflict. 
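+    // (With kNumPerFile == 100 and kNumFiles == 4, the test keys span
+    // [0, 400), so the extra key below is GetNumericStr(400), one past the
+    // largest key the assertions check.)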
+    db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val");
+
+    for (int j = 0; j < kNumPerFile; ++j) {
+      db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val");
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+            TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ReadOptions read_opts;
+      read_opts.ignore_range_deletions = true;
+      std::string value;
+      if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+        ASSERT_OK(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+      } else {
+        ASSERT_TRUE(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+                .IsNotFound());
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.max_bytes_for_level_base = 2 * kFileBytes;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 3;
+  options.target_file_size_base = kFileBytes;
+  options.target_file_size_multiplier = 1;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < kNumFiles; ++j) {
+      if (i > 0) {
+        // delete [95,105) in two files, [295,305) in next two
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                         Key(mid - 5), Key(mid + 5));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(RandomString(&rnd, 990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      dbfull()->TEST_WaitForFlushMemTable();
+      if (j < kNumFiles - 1) {
+        // background compaction may happen early for kNumFiles'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+      if (j == options.level0_file_num_compaction_trigger - 1) {
+        // When i == 1, compaction will output some files to L1, at which
+        // point L1 is not bottommost so range deletions cannot be compacted
+        // away. The new L1 files must be generated with non-overlapping key
+        // ranges even though multiple subcompactions see the same ranges
+        // deleted, else an assertion will fail.
+        //
+        // Only enable auto-compactions when we're ready; otherwise, the
+        // oversized L0 (relative to base_level) causes the compaction to run
+        // earlier.
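+        // (For reference, with kNumPerFile == 100 the expression
+        // (j + (1 - j % 2)) * kNumPerFile above evaluates to 100, 100, 300,
+        // 300 for j = 0..3, which is how each deletion lands in two adjacent
+        // files.)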
+        ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+        dbfull()->TEST_WaitForCompact();
+        ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+                                  {{"disable_auto_compactions", "true"}}));
+        ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+        ASSERT_GT(NumTableFilesAtLevel(1), 0);
+        ASSERT_GT(NumTableFilesAtLevel(2), 0);
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+  Options options = CurrentOptions();
+  options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kFilesPerLevel;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
+  options.num_levels = kNumLevels;
+  options.target_file_size_base = kNumPerFile << 10;
+  options.target_file_size_multiplier = 1;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevels - 1; ++i) {
+    for (int j = 0; j < kFilesPerLevel; ++j) {
+      if (i == kNumLevels - 2) {
+        // insert range deletions [95,105) in two files, [295,305) in next two
+        // to prepare L1 for later manual compaction.
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                         Key(mid - 5), Key(mid + 5));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(RandomString(&rnd, 990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      dbfull()->TEST_WaitForFlushMemTable();
+      if (j < kFilesPerLevel - 1) {
+        // background compaction may happen early for kFilesPerLevel'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+    }
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+  }
+  // Now L1-L3 are full. When we compact L1->L2 we should see (1)
+  // subcompactions happen since input level > 0; (2) range deletions are not
+  // dropped since output level is not bottommost. If no file boundary
+  // assertion fails, that probably means universal compaction + subcompaction
+  // + range deletion are compatible.
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+          ->cfd(),
+      1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
+      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+      true /* disallow_trivial_move */,
+      port::kMaxUint64 /* max_file_num_to_ignore */));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+  const int kNumPerFile = 3, kNumFiles = 3;
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile));
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  opts.num_levels = 2;
+  Reopen(opts);
+
+  // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+  // requires an extra entry.
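+  // (That is 10 iterations, i = 0..9: the DeleteRange fires at i == 6 and
+  // covers operands 0..5, so after compaction only 6+7+8+9 == 30 remains,
+  // matching the expected values asserted below.)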
+ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) { + if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) { + // Delete merge operands from all but the last file + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", + "key_"); + } + std::string val; + PutFixed64(&val, i); + db_->Merge(WriteOptions(), "key", val); + // we need to prevent trivial move using Puts so compaction will actually + // process the merge operands. + db_->Put(WriteOptions(), "prevent_trivial_move", ""); + if (i > 0 && i % kNumPerFile == 0) { + dbfull()->TEST_WaitForFlushMemTable(); + } + } + + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 45); // 1+2+...+9 + ASSERT_EQ(expected, actual); + + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + expected.clear(); + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + uint64_t tmp; + Slice tmp2(actual); + GetFixed64(&tmp2, &tmp); + PutFixed64(&expected, 30); // 6+7+8+9 (earlier operands covered by tombstone) + ASSERT_EQ(expected, actual); +} + +TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { + // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4) + // Flush. The `CompactionIterator` previously had a bug where we forgot to + // check for covering range tombstones when processing the (1) Put, causing + // it to reappear after the flush. + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + std::string val; + PutFixed64(&val, 1); + ASSERT_OK(db_->Put(WriteOptions(), "key", val)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 1); + ASSERT_EQ(expected, actual); +} + +// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { + // During compaction to bottommost level, verify range tombstones older than + // the oldest snapshot are removed, while others are preserved. + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + opts.num_levels = 2; + opts.statistics = CreateDBStatistics(); + Reopen(opts); + + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr10"); // obsolete after compaction + db_->Put(WriteOptions(), "key", "val"); + db_->Flush(FlushOptions()); + const Snapshot* snapshot = db_->GetSnapshot(); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", + "dr20"); // protected by snapshot + db_->Put(WriteOptions(), "key", "val"); + db_->Flush(FlushOptions()); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, TableEvictedDuringScan) { + // The RangeDelAggregator holds pointers into range deletion blocks created by + // table readers. This test ensures the aggregator can still access those + // blocks even if it outlives the table readers that created them. 
+  //
+  // DBIter always keeps readers open for L0 files. So, in order to test
+  // aggregator outliving reader, we need to have deletions in L1 files, which
+  // are opened/closed on-demand during the scan. This is accomplished by
+  // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+  // from all lingering in L0 (there is at most one range deletion per L0
+  // file).
+  //
+  // The first L1 file will contain a range deletion since its begin key is 0.
+  // SeekToFirst() references that table's reader and adds its range tombstone
+  // to the aggregator. Upon advancing beyond that table's key-range via
+  // Next(), the table reader will be unreferenced by the iterator. Since we
+  // manually call Evict() on all readers before the full scan, this
+  // unreference causes the reader's refcount to drop to zero and thus be
+  // destroyed.
+  //
+  // When it is destroyed, we do not remove its range deletions from the
+  // aggregator. So, subsequent calls to Next() must be able to use these
+  // deletions to decide whether a key is covered. This will work as long as
+  // the aggregator properly references the range deletion block.
+  const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.level0_file_num_compaction_trigger = 4;
+  opts.level0_stop_writes_trigger = 4;
+  opts.memtable_factory.reset(new SpecialSkipListFactory(1));
+  opts.num_levels = 2;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.block_cache = NewLRUCache(8 << 20);
+  opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(opts);
+
+  // Hold a snapshot so range deletions can't become obsolete during compaction
+  // to bottommost level (i.e., L1).
+  const Snapshot* snapshot = db_->GetSnapshot();
+  for (int i = 0; i < kNum; ++i) {
+    db_->Put(WriteOptions(), GetNumericStr(i), "val");
+    if (i > 0) {
+      dbfull()->TEST_WaitForFlushMemTable();
+    }
+    if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                       GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd));
+    }
+  }
+  // Must be > 1 so the first L1 file can be closed before scan finishes
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+  ReadOptions read_opts;
+  auto* iter = db_->NewIterator(read_opts);
+  int expected = kRangeEnd;
+  iter->SeekToFirst();
+  for (auto file_number : file_numbers) {
+    // This puts table caches in the state of being externally referenced only
+    // so they are destroyed immediately upon iterator unreferencing.
+    TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+  }
+  for (; iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    ++expected;
+    // Keep clearing block cache's LRU so range deletion block can be freed as
+    // soon as its refcount drops to zero.
+ bbto.block_cache->EraseUnRefEntries(); + } + ASSERT_EQ(kNum, expected); + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) { + do { + DestroyAndReopen(CurrentOptions()); + db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + } while (ChangeOptions(kRangeDelSkipConfigs)); +} + +TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) { + do { + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 3; + opts.min_write_buffer_number_to_merge = 2; + // SpecialSkipListFactory lets us specify maximum number of elements the + // memtable can hold. It switches the active memtable to immutable (flush is + // prevented by the above options) upon inserting an element that would + // overflow the memtable. + opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + DestroyAndReopen(opts); + + db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + db_->Put(WriteOptions(), "blah", "val"); + + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + } while (ChangeOptions(kRangeDelSkipConfigs)); +} + +TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) { + do { + DestroyAndReopen(CurrentOptions()); + db_->Put(WriteOptions(), "key", "val"); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + db_->ReleaseSnapshot(snapshot); + } while (ChangeOptions(kRangeDelSkipConfigs)); +} + +TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) { + const int kNumMergeOps = 10; + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + for (int i = 0; i < kNumMergeOps; ++i) { + std::string val; + PutFixed64(&val, i); + db_->Merge(WriteOptions(), "key", val); + if (i == kNumMergeOps / 2) { + // deletes [0, 5] + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", + "key_"); + } + } + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 30); // 6+7+8+9 + ASSERT_EQ(expected, actual); + + expected.clear(); + read_opts.ignore_range_deletions = true; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 45); // 0+1+2+...+9 + ASSERT_EQ(expected, actual); +} + +TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) { + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 4; + opts.min_write_buffer_number_to_merge = 3; + opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + Reopen(opts); + + db_->Put(WriteOptions(), "sst_key", "val"); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); + db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK( + 
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + for (std::string key : {"sst_key", "imm_key", "mem_key"}) { + std::string value; + ASSERT_OK(db_->Get(read_opts, key, &value)); + } + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) { + const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; + Options opts = CurrentOptions(); + opts.comparator = test::Uint64Comparator(); + opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + Reopen(opts); + + // Write half of the keys before the tombstone and half after the tombstone. + // Only covered keys (i.e., within the range and older than the tombstone) + // should be deleted. + for (int i = 0; i < kNum; ++i) { + if (i == kNum / 2) { + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + } + db_->Put(WriteOptions(), GetNumericStr(i), "val"); + } + ReadOptions read_opts; + auto* iter = db_->NewIterator(read_opts); + + int expected = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(GetNumericStr(expected), iter->key()); + if (expected == kRangeBegin - 1) { + expected = kNum / 2; + } else { + ++expected; + } + } + ASSERT_EQ(kNum, expected); + delete iter; +} + +TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) { + const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; + Options opts = CurrentOptions(); + opts.comparator = test::Uint64Comparator(); + opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + Reopen(opts); + + const Snapshot* snapshot = nullptr; + // Put a snapshot before the range tombstone, verify an iterator using that + // snapshot sees all inserted keys. 
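+  // Only keys 0..kNum / 2 - 1 exist when the snapshot is taken; the tombstone
+  // and the second half of the puts are newer, so the scan below should
+  // surface exactly kNum / 2 keys.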
+ for (int i = 0; i < kNum; ++i) { + if (i == kNum / 2) { + snapshot = db_->GetSnapshot(); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + } + db_->Put(WriteOptions(), GetNumericStr(i), "val"); + } + ReadOptions read_opts; + read_opts.snapshot = snapshot; + auto* iter = db_->NewIterator(read_opts); + + int expected = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(GetNumericStr(expected), iter->key()); + ++expected; + } + ASSERT_EQ(kNum / 2, expected); + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) { + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 4; + opts.min_write_buffer_number_to_merge = 3; + opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + Reopen(opts); + + db_->Put(WriteOptions(), "sst_key", "val"); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); + db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + ReadOptions read_opts; + read_opts.ignore_range_deletions = true; + auto* iter = db_->NewIterator(read_opts); + int i = 0; + std::string expected[] = {"imm_key", "mem_key", "sst_key"}; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) { + std::string key; + ASSERT_EQ(expected[i], iter->key()); + } + ASSERT_EQ(3, i); + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +#ifndef ROCKSDB_UBSAN_RUN +TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { + db_->Put(WriteOptions(), "key", "val"); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + + // iterations check unsupported in memtable, l0, and then l1 + for (int i = 0; i < 3; ++i) { + ReadOptions read_opts; + read_opts.tailing = true; + auto* iter = db_->NewIterator(read_opts); + if (i == 2) { + // For L1+, iterators over files are created on-demand, so need seek + iter->SeekToFirst(); + } + ASSERT_TRUE(iter->status().IsNotSupported()); + delete iter; + if (i == 0) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else if (i == 1) { + MoveFilesToLevel(1); + } + } + db_->ReleaseSnapshot(snapshot); +} + +#endif // !ROCKSDB_UBSAN_RUN + +TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { + const int kNumFiles = 2, kNumKeysPerFile = 4; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.level0_file_num_compaction_trigger = kNumFiles; + options.max_subcompactions = 2; + options.num_levels = 2; + options.target_file_size_base = 4096; + Reopen(options); + + // need a L1 file for subcompaction to be triggered + ASSERT_OK( + db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(1); + + // put enough keys to fill up the first subcompaction, and later range-delete + // them so that the first subcompaction outputs no key-values. In that case + // it'll consider making an SST file dedicated to range deletions. 
+  for (int i = 0; i < kNumKeysPerFile; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+                       std::string(1024, 'a')));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeysPerFile)));
+
+  // the above range tombstone can be dropped, so that one alone won't cause a
+  // dedicated file to be opened. We can make one protected by snapshot that
+  // must be considered. Make its range outside the first subcompaction's range
+  // to exercise the tricky part of the code.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             Key(kNumKeysPerFile + 1),
+                             Key(kNumKeysPerFile + 2)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  db_->EnableAutoCompaction({db_->DefaultColumnFamily()});
+  dbfull()->TEST_WaitForCompact();
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+  // regression test for #2743. the range delete tombstones in memtable should
+  // be added even when Get() skips searching due to its prefix bloom filter
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kNumKeys = 1000;
+  const int kPrefixLen = 8;
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  Reopen(options);
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  Flush();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeys)));
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+  // This test originally verified that compaction treated files containing a
+  // split range deletion in the input level as an atomic unit. I.e.,
+  // compacting any input-level file(s) containing a portion of the range
+  // deletion causes all other input-level files containing portions of that
+  // same range deletion to be included in the compaction. Range deletion
+  // tombstones are now truncated to sstable boundaries, which removed the need
+  // for that behavior (which could lead to excessively large compactions).
+  const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(2 /* num_entries_flush */));
+  options.target_file_size_base = kValueBytes;
+  // i == 0: CompactFiles
+  // i == 1: CompactRange
+  // i == 2: automatic compaction
+  for (int i = 0; i < 3; ++i) {
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put(Key(0), ""));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    MoveFilesToLevel(2);
+    ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+    // snapshot protects range tombstone from dropping due to becoming
+    // obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot(); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(2 * kNumFilesPerLevel)); + + Random rnd(301); + std::string value = RandomString(&rnd, kValueBytes); + for (int j = 0; j < kNumFilesPerLevel; ++j) { + // give files overlapping key-ranges to prevent trivial move + ASSERT_OK(Put(Key(j), value)); + ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); + if (j > 0) { + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(j, NumTableFilesAtLevel(0)); + } + } + // put extra key to trigger final flush + ASSERT_OK(Put("", "")); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1)); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + if (i == 0) { + ASSERT_OK(db_->CompactFiles( + CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + } else if (i == 1) { + auto begin_str = Key(0), end_str = Key(1); + Slice begin = begin_str, end = end_str; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end)); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); + } else if (i == 2) { + ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), + {{"max_bytes_for_level_base", "10000"}})); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + } + ASSERT_GT(NumTableFilesAtLevel(2), 0); + + db_->ReleaseSnapshot(snapshot); + } +} + +TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { + // Test the handling of the range-tombstone end-key as the + // upper-bound for an sstable. + + const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = kNumFilesPerLevel; + options.memtable_factory.reset( + new SpecialSkipListFactory(2 /* num_entries_flush */)); + options.target_file_size_base = kValueBytes; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + + // Create an initial sstable at L2: + // [key000000#1,1, key000000#1,1] + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // A snapshot protects the range tombstone from dropping due to + // becoming obsolete. + const Snapshot* snapshot = db_->GetSnapshot(); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(2 * kNumFilesPerLevel)); + + // Create 2 additional sstables in L0. Note that the first sstable + // contains the range tombstone. + // [key000000#3,1, key000004#72057594037927935,15] + // [key000001#5,1, key000002#6,1] + Random rnd(301); + std::string value = RandomString(&rnd, kValueBytes); + for (int j = 0; j < kNumFilesPerLevel; ++j) { + // Give files overlapping key-ranges to prevent a trivial move when we + // compact from L0 to L1. + ASSERT_OK(Put(Key(j), value)); + ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(j + 1, NumTableFilesAtLevel(0)); + } + // Compact the 2 L0 sstables to L1, resulting in the following LSM. There + // are 2 sstables generated in L1 due to the target_file_size_base setting. 
+  // L1:
+  //   [key000000#3,1, key000002#72057594037927935,15]
+  //   [key000002#6,1, key000004#72057594037927935,15]
+  // L2:
+  //   [key000000#1,1, key000000#1,1]
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  {
+    // Compact the second sstable in L1:
+    // L1:
+    //   [key000000#3,1, key000002#72057594037927935,15]
+    // L2:
+    //   [key000000#1,1, key000000#1,1]
+    //   [key000002#6,1, key000004#72057594037927935,15]
+    //
+    // At the same time, verify the compaction does not cause the key at the
+    // endpoint (key000002#6,1) to disappear.
+    ASSERT_EQ(value, Get(Key(2)));
+    auto begin_str = Key(3);
+    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, nullptr);
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(2, NumTableFilesAtLevel(2));
+    ASSERT_EQ(value, Get(Key(2)));
+  }
+
+  {
+    // Compact the first sstable in L1. This should be copacetic, but
+    // was previously resulting in overlapping sstables in L2 due to
+    // mishandling of the range tombstone end-key when used as the
+    // largest key for an sstable. The resulting LSM structure should
+    // be:
+    //
+    // L2:
+    //   [key000000#1,1, key000001#72057594037927935,15]
+    //   [key000001#5,1, key000002#72057594037927935,15]
+    //   [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(0);
+    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, &begin);
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+  // Regression test for #2752. Range delete tombstones between
+  // different snapshot stripes are not stored in order, so the first
+  // tombstone of each snapshot stripe should be checked as a smallest
+  // candidate.
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  auto cf = db_->DefaultColumnFamily();
+
+  ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+  // Hold a snapshot to separate these two delete ranges.
+  auto snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  db_->ReleaseSnapshot(snapshot);
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(cf, &files);
+  ASSERT_EQ(1, files[0].size());
+  ASSERT_EQ("a", files[0][0].smallest.user_key());
+  ASSERT_EQ("c", files[0][0].largest.user_key());
+
+  std::string v;
+  auto s = db_->Get(ReadOptions(), "a", &v);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+  // Mock non-associative operator. Non-associativity is expressed by lack of
+  // implementation for any `PartialMerge*` functions.
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    assert(merge_out != nullptr);
+    merge_out->new_value = merge_in.operand_list.back().ToString();
+    return true;
+  }
+
+  const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+  // This test uses a non-associative merge operator since that is a
+  // convenient way to get compaction to write out files with overlapping
+  // user-keys at the endpoints.
+  // Note, however, overlapping endpoints can also occur with other value
+  // types (Put, etc.), assuming the right snapshots are present.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  // Push dummy data to L3 so that our actual test files on L0-L2 will not be
+  // considered the "bottommost" level; otherwise compaction may prevent us
+  // from creating overlapping user keys, since MergeHelper collapses merge
+  // operands on the bottommost level.
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(3);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = RandomString(&rnd, kValueBytes);
+      ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+    }
+    if (i == kNumFiles - 1) {
+      // Take snapshot to prevent covered merge operands from being dropped by
+      // compaction.
+      snapshot = db_->GetSnapshot();
+      // The DeleteRange is the last write so all merge operands are covered.
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "key", "key_"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  std::string value;
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+                              nullptr /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  // Now we have multiple files at L1 all containing a single user key, thus
+  // guaranteeing overlap in the file endpoints.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Verify no merge operands reappeared after the compaction.
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  // Compact and verify again. It's worthwhile because now the files have
+  // tighter endpoints, so we can verify that doesn't mess anything up.
+  dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */,
+                              nullptr /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+  ASSERT_GT(NumTableFilesAtLevel(2), 1);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+  // Verify a key newer than a range tombstone cannot be deleted by being
+  // compacted to the bottom level (and thus having its seqnum zeroed) before
+  // the range tombstone. This used to happen when range tombstones were
+  // untruncated on reads such that they extended past their file boundaries.
+  //
+  // Test summary:
+  //
+  // - L1 is bottommost.
+  // - A couple snapshots are strategically taken to prevent seqnums from being
+  //   zeroed, range tombstone from being dropped, merge operands from being
+  //   dropped, and merge operands from being combined.
+  // - Left half of files in L1 all have same user key, ensuring their file
+  //   boundaries overlap. In the past this would cause range tombstones to be
+  //   untruncated.
+  // - Right half of L1 files all have different keys, ensuring no overlap.
+  // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
+  // - Keys in the right side of the key-range are overwritten. These are
+  //   compacted down to L1 after releasing snapshots such that their seqnums
+  //   will be zeroed.
+  // - A full range scan is performed. If the tombstone in the left L1 files
+  //   were untruncated, it would now cover keys newer than it (but with zeroed
+  //   seqnums) in the right L1 files.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  const int kNumFiles = 4;
+  const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
+  const int kKeysOverwritten = 10;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.num_levels = 2;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  // - snapshots[0] prevents merge operands from being combined during
+  //   compaction.
+  // - snapshots[1] prevents merge operands from being dropped due to the
+  //   covering range tombstone.
+  const Snapshot* snapshots[] = {nullptr, nullptr};
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = RandomString(&rnd, kValueBytes);
+      std::string key;
+      if (i < kNumFiles / 2) {
+        key = Key(0);
+      } else {
+        key = Key(1 + i * kFileBytes / kValueBytes + j);
+      }
+      ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+    }
+    if (i == 0) {
+      snapshots[0] = db_->GetSnapshot();
+    }
+    if (i == kNumFiles - 1) {
+      snapshots[1] = db_->GetSnapshot();
+      // The DeleteRange is the last write so all merge operands are covered.
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key(0), Key(kMaxKey + 1)));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+  auto get_key_count = [this]() -> int {
+    auto* iter = db_->NewIterator(ReadOptions());
+    iter->SeekToFirst();
+    int keys_found = 0;
+    for (; iter->Valid(); iter->Next()) {
+      ++keys_found;
+    }
+    delete iter;
+    return keys_found;
+  };
+
+  // All keys should be covered
+  ASSERT_EQ(0, get_key_count());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+                              nullptr /* end_key */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  // Roughly the left half of L1 files should have overlapping boundary keys,
+  // while the right half should not.
+  ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+  // Now overwrite a few keys that are in L1 files that definitely don't have
+  // overlapping boundary keys.
+  for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
+    auto value = RandomString(&rnd, kValueBytes);
+    ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // The overwritten keys are in L0 now, so clearly aren't covered by the range
+  // tombstone in L1.
+  ASSERT_EQ(kKeysOverwritten, get_key_count());
+
+  // Release snapshots so seqnums can be zeroed when L0->L1 happens.
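+  // (Sequence numbers are zeroed only for keys at the bottommost level that
+  // are older than every live snapshot, so both snapshots must be released
+  // before the final compaction.)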
+ db_->ReleaseSnapshot(snapshots[0]); + db_->ReleaseSnapshot(snapshots[1]); + + auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1); + auto end_key_storage = Key(kMaxKey); + Slice begin_key(begin_key_storage); + Slice end_key(end_key_storage); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles); + + ASSERT_EQ(kKeysOverwritten, get_key_count()); +} + +TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) { + // Exposes a bug where we were using + // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands + // in the forward direction. Confusingly, this case happened during + // `DBIter::Prev`. It could cause assertion failure, or reappearing keys. + const int kFileBytes = 1 << 20; + const int kValueBytes = 1 << 10; + // Need multiple keys so we can get results when calling `Prev()` after + // `SeekToLast()`. + const int kNumKeys = 3; + const int kNumFiles = 4; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.merge_operator.reset(new MockMergeOperator()); + options.target_file_size_base = kFileBytes; + Reopen(options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = RandomString(&rnd, kValueBytes); + ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value)); + if (i == 0 && j == kNumKeys) { + // Take snapshot to prevent covered merge operands from being dropped or + // merged by compaction. + snapshot = db_->GetSnapshot(); + // Do a DeleteRange near the beginning so only the oldest merge operand + // for each key is covered. This ensures the sequence of events: + // + // - `DBIter::Prev()` is called + // - After several same versions of the same user key are encountered, + // it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`. + // - Binary searches to the newest version of the key, which is in the + // leftmost file containing the user key. + // - Scans forwards to collect all merge operands. Eventually reaches + // the rightmost file containing the oldest merge operand, which + // should be covered by the `DeleteRange`. If `RangeDelAggregator` + // were not properly using `kForwardTraversal` here, that operand + // would reappear. 
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(0), Key(kNumKeys + 1)));
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+                              nullptr /* end_key */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  auto* iter = db_->NewIterator(ReadOptions());
+  iter->SeekToLast();
+  int keys_found = 0;
+  for (; iter->Valid(); iter->Prev()) {
+    ++keys_found;
+  }
+  delete iter;
+  ASSERT_EQ(kNumKeys, keys_found);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+  const int kFileBytes = 1 << 20;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(10)));
+
+  db_->Flush(FlushOptions());
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+  auto* iter = db_->NewIterator(read_opts);
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(Key(0), iter->key());
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
+  const int kFileBytes = 1 << 20;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  // block flush thread -> pin immtables in memory
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
+       "DBImpl::BGWorkFlush"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(0), "a"));
+  std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
+      snapshot(db_->GetSnapshot(),
+               [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(10)));
+
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot.get();
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+
+  TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(Key(0), iter->key());
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+  // Adapted from
+  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+  // Regression test for issue where range tombstone was written to more files
+  // than necessary when it began exactly at the begin key in the next
+  // compaction output file.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  // Have a bit of slack in the size limits but we enforce them more strictly
+  // when manually flushing/compacting.
+  options.max_compaction_bytes = 2 * kFileBytes;
+  options.target_file_size_base = 2 * kFileBytes;
+  options.write_buffer_size = 2 * kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  for (char first_char : {'a', 'b', 'c'}) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      std::string key(1, first_char);
+      key.append(Key(i));
+      std::string value = RandomString(&rnd, kValueBytes);
+      ASSERT_OK(Put(key, value));
+    }
+    db_->Flush(FlushOptions());
+    MoveFilesToLevel(2);
+  }
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+  // Populate the memtable lightly while spanning the whole key-space. The
+  // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
+  // files to prevent a large L1->L2 compaction later.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "c" + Key(1), "d"));
+  // Our compaction output file cutting logic currently only considers point
+  // keys. So, in order for the range tombstone to have a chance at landing at
+  // the start of a new file, we need a point key at the range tombstone's
+  // start.
+  // TODO(ajkr): remove this `Put` after file cutting accounts for range
+  // tombstones (#3977).
+  ASSERT_OK(Put("c" + Key(1), "value"));
+  db_->Flush(FlushOptions());
+
+  // Ensure manual L0->L1 compaction cuts the outputs before the range
+  // tombstone and the range tombstone is only placed in the second SST.
+  std::string begin_key_storage("c" + Key(1));
+  Slice begin_key(begin_key_storage);
+  std::string end_key_storage("d");
+  Slice end_key(end_key_storage);
+  dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */,
+                              &end_key /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> all_metadata;
+  std::vector<LiveFileMetaData> l1_metadata;
+  db_->GetLiveFilesMetaData(&all_metadata);
+  for (const auto& metadata : all_metadata) {
+    if (metadata.level == 1) {
+      l1_metadata.push_back(metadata);
+    }
+  }
+  std::sort(l1_metadata.begin(), l1_metadata.end(),
+            [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+              return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+                     0;
+            });
+  ASSERT_EQ("a", l1_metadata[0].smallestkey);
+  ASSERT_EQ("a", l1_metadata[0].largestkey);
+  ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+  ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+  TablePropertiesCollection all_table_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+  int64_t num_range_deletions = 0;
+  for (const auto& name_and_table_props : all_table_props) {
+    const auto& name = name_and_table_props.first;
+    const auto& table_props = name_and_table_props.second;
+    // The range tombstone should only be output to the second L1 SST.
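+    // (GetPropertiesOfAllTables() keys its entries by full file path, while
+    // LiveFileMetaData::name is relative to the DB directory, hence the
+    // suffix comparison below.)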
+    if (name.size() >= l1_metadata[1].name.size() &&
+        name.substr(name.size() - l1_metadata[1].name.size())
+                .compare(l1_metadata[1].name) == 0) {
+      ASSERT_EQ(1, table_props->num_range_deletions);
+      ++num_range_deletions;
+    } else {
+      ASSERT_EQ(0, table_props->num_range_deletions);
+    }
+  }
+  ASSERT_EQ(1, num_range_deletions);
+}
+
+TEST_F(DBRangeDelTest, OverlappedTombstones) {
+  const int kNumPerFile = 4, kNumFiles = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.max_compaction_bytes = 9 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(RandomString(&rnd, 3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key(kNumFiles * kNumPerFile + 1)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+
+  // The tombstone range is not broken up into multiple SSTs, even though that
+  // may incur a large compaction with L2.
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, OverlappedKeys) {
+  const int kNumPerFile = 4, kNumFiles = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.max_compaction_bytes = 9 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(RandomString(&rnd, 3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+  for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
+    ASSERT_OK(Put(Key(i), "0x123"));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // The key range is broken up into three SSTs to avoid a future big
+  // compaction with the grandparent.
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+                              true /* disallow_trivial_move */);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_sst_test.cc b/src/rocksdb/db/db_sst_test.cc
new file mode 100644
index 000000000..e0ecfb641
--- /dev/null
+++ b/src/rocksdb/db/db_sst_test.cc
@@ -0,0 +1,1227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBSSTTest : public DBTestBase {
+ public:
+  DBSSTTest() : DBTestBase("/db_sst_test") {}
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+  void ClearFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.clear();
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBSSTTest, DontDeletePendingOutputs) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Every time we write to a table file, call FOF/POF with a full DB scan.
+  // This makes sure our pending_outputs_ protection works correctly.
+  std::function<void()> purge_obsolete_files_function = [&]() {
+    JobContext job_context(0);
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+    dbfull()->TEST_UnlockMutex();
+    dbfull()->PurgeObsoleteFiles(job_context);
+    job_context.Clean();
+  };
+
+  env_->table_write_callback_ = &purge_obsolete_files_function;
+
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(Put("a", "begin"));
+    ASSERT_OK(Put("z", "end"));
+    ASSERT_OK(Flush());
+  }
+
+  // If the pending output guard does not work correctly, PurgeObsoleteFiles()
+  // will delete the file that the compaction is trying to create, causing
+  // this: error db/db_test.cc:975: IO error:
+  // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+  Compact("a", "b");
+}
+
+// 1 Create some SST files by inserting K-V pairs into DB
+// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file
+// 3 Open DB and check if all keys can be read
+TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.num_levels = 4;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_id = 0;
+  for (int i = 0; i < 10; ++i) {
+    GenerateNewFile(&rnd, &key_id, false);
+  }
+  Flush();
+  Close();
+  int const num_files = GetSstFileCount(dbname_);
+  ASSERT_GT(num_files, 0);
+
+  Reopen(options);
+  std::vector<std::string> values;
+  values.reserve(key_id);
+  for (int k = 0; k < key_id; ++k) {
+    values.push_back(Get(Key(k)));
+  }
+  Close();
+
+  std::vector<std::string> filenames;
+  GetSstFiles(env_, dbname_, &filenames);
+  int num_ldb_files = 0;
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    if (i & 1) {
+      continue;
+    }
+    std::string const rdb_name = dbname_ + "/" + filenames[i];
+    std::string const ldb_name =
+        Rocks2LevelTableFileName(rdb_name);
+    ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+    ++num_ldb_files;
+  }
+  ASSERT_GT(num_ldb_files, 0);
+  ASSERT_EQ(num_files, GetSstFileCount(dbname_));
+
+  Reopen(options);
+  for (int k = 0; k < key_id; ++k) {
+    ASSERT_EQ(values[k], Get(Key(k)));
+  }
+  Destroy(options);
+}
+
+// Check that we don't crash when opening DB with
+// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
+TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
+  ASSERT_OK(Put("pika", "choo"));
+  ASSERT_OK(Flush());
+
+  // Just open the DB with the option set to true and check that we don't
+  // crash.
+  Options options;
+  options.skip_checking_sst_file_sizes_on_db_open = true;
+  Reopen(options);
+
+  ASSERT_EQ("choo", Get("pika"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBSSTTest, DontDeleteMovedFile) {
+  // This test triggers move compaction and verifies that the file is not
+  // deleted while it is part of the move compaction.
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // If the moved file is actually deleted (the move-safeguard in
+  // ~Version::Version() is not there), we get this failure:
+  // Corruption: Can't access /000009.sst
+  Reopen(options);
+}
+
+// This reproduces a bug where we don't delete a file because, when it was
+// supposed to be deleted, it was blocked by pending_outputs.
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 is created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File
+// 13 is deleted from the obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
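+// A minimal sketch of the FindObsoleteFiles()/PurgeObsoleteFiles() sequence
+// the tests in this file drive through DBImpl's TEST_ hooks (illustrative
+// only; kept in a comment so it does not interfere with the tests):
+//
+//   JobContext job_context(0);
+//   dbfull()->TEST_LockMutex();
+//   dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+//   dbfull()->TEST_UnlockMutex();
+//   dbfull()->PurgeObsoleteFiles(job_context);
+//   job_context.Clean();
+//
+// The test below stages steps 1-6 above and then verifies that the file is
+// deleted once the blocking compaction has finished.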
+TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 2 * 1024 * 1024;     // 2 MB
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
+  options.max_background_flushes = 2;
+  options.max_background_compactions = 2;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  Reopen(options);
+
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  test::SleepingBackgroundTask blocking_thread;
+  port::Mutex mutex_;
+  bool already_blocked(false);
+
+  // block the flush
+  std::function<void()> block_first_time = [&]() {
+    bool blocking = false;
+    {
+      MutexLock l(&mutex_);
+      if (!already_blocked) {
+        blocking = true;
+        already_blocked = true;
+      }
+    }
+    if (blocking) {
+      blocking_thread.DoSleep();
+    }
+  };
+  env_->table_write_callback_ = &block_first_time;
+  // Insert 2.5MB of data, which should trigger a flush because we exceed
+  // write_buffer_size. The flush will be blocked by block_first_time;
+  // pending_file is protecting all the files created after it.
+  for (int j = 0; j < 256; ++j) {
+    ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024)));
+  }
+  blocking_thread.WaitUntilSleeping();
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  auto file_on_L2 = metadata[0].name;
+  listener->SetExpectedFileName(dbname_ + file_on_L2);
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr,
+                                        true /* disallow trivial move */));
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+  // finish the flush!
+  blocking_thread.WakeUp();
+  blocking_thread.WaitUntilDone();
+  dbfull()->TEST_WaitForFlushMemTable();
+  // The file just flushed is too big for L0 and L1, so it gets moved to L2.
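+  // Rough arithmetic behind this expectation (illustrative): the loop wrote
+  // 256 * 10KB ~= 2.5MB, exceeding write_buffer_size (2MB) and triggering the
+  // flush; the resulting file is bigger than max_bytes_for_level_base (1MB),
+  // so it cannot stay in L0/L1 once the compaction below runs.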
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0));
+
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 2U);
+
+  // This file should have been deleted during the last compaction
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2));
+  listener->VerifyMatchedCount(1);
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManager) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  int files_added = 0;
+  int files_deleted = 0;
+  int files_moved = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnDeleteFile",
+      [&](void* /*arg*/) { files_deleted++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 25; i++) {
+    GenerateNewRandomFile(&rnd);
+    ASSERT_OK(Flush());
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    // Verify that we are tracking all sst files in dbname_
+    ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles());
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  auto files_in_db = GetAllSSTFiles();
+  // Verify that we are tracking all sst files in dbname_
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  // Verify the total files size
+  uint64_t total_files_size = 0;
+  for (auto& file_to_size : files_in_db) {
+    total_files_size += file_to_size.second;
+  }
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+  // We flushed at least 25 files
+  ASSERT_GE(files_added, 25);
+  // Compaction must have deleted some files
+  ASSERT_GT(files_deleted, 0);
+  // No files were moved
+  ASSERT_EQ(files_moved, 0);
+
+  Close();
+  Reopen(options);
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+  // Verify that we track all the files again after the DB is closed and opened
+  Close();
+  sst_file_manager.reset(NewSstFileManager(env_));
+  options.sst_file_manager = sst_file_manager;
+  sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Reopen(options);
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, RateLimitedDelete) {
+  Destroy(last_options_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBSSTTest::RateLimitedDelete:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+        // Turn the timed wait into a simulated sleep
+        uint64_t* abs_time_us = static_cast<uint64_t*>(arg);
+        int64_t cur_time = 0;
+        env_->GetCurrentTime(&cur_time);
+        if (*abs_time_us > static_cast<uint64_t>(cur_time)) {
+          env_->addon_time_.fetch_add(*abs_time_us -
+                                      static_cast<uint64_t>(cur_time));
+        }
+
+        // Sleep for a short random duration
+        env_->addon_time_.fetch_add(
+            static_cast<uint64_t>(Random::GetTLSInstance()->Uniform(10)));
+
+        // Set the wait-until time to before the current time to force no
+        // sleep.
+        int64_t real_cur_time = 0;
+        Env::Default()->GetCurrentTime(&real_cur_time);
+        *abs_time_us = static_cast<uint64_t>(real_cur_time);
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  env_->no_slowdown_ = true;
+  env_->time_elapse_only_sleep_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  // Need to disable stats dumping and persisting, which also use
+  // RepeatableThread, one of whose member variables is of type
+  // InstrumentedCondVar. The callback for
+  // InstrumentedCondVar::TimedWaitInternal can be triggered by the stats
+  // dumping and persisting threads and cause the time_spent_deleting
+  // measurement to become incorrect.
+  options.stats_dump_period_sec = 0;
+  options.stats_persist_period_sec = 0;
+  options.env = env_;
+
+  int64_t rate_bytes_per_sec = 1024 * 10;  // 10 KB/sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+  options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+  ASSERT_OK(TryReopen(options));
+  // Create 4 files in L0
+  for (char v = 'a'; v <= 'd'; v++) {
+    ASSERT_OK(Put("Key2", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key3", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key1", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  uint64_t delete_start_time = env_->NowMicros();
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1");
+  sfm->WaitForEmptyTrash();
+  uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+  uint64_t total_files_size = 0;
+  uint64_t expected_penalty = 0;
+  ASSERT_EQ(penalties.size(), metadata.size());
+  for (size_t i = 0; i < metadata.size(); i++) {
+    total_files_size += metadata[i].size;
+    expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec);
+    ASSERT_EQ(expected_penalty, penalties[i]);
+  }
+  ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+  ASSERT_LT(time_spent_deleting, expected_penalty * 1.1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, RateLimitedWALDelete) {
+  Destroy(last_options_);
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+  env_->no_slowdown_ = true;
+  env_->time_elapse_only_sleep_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env_;
+
+  int64_t rate_bytes_per_sec = 1024 * 10;  // 10 KB/sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+
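+  // Illustrative penalty math (mirroring the expected_penalty computation in
+  // RateLimitedDelete above): at 10KB/sec, trashing a hypothetical 1MB file
+  // schedules 1048576 * 1000000 / 10240 = 102,400,000 microseconds (~102
+  // seconds) of wait, which is why these tests run on simulated time.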
+  options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+  ASSERT_OK(TryReopen(options));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Create 4 files in L0
+  for (char v = 'a'; v <= 'd'; v++) {
+    ASSERT_OK(Put("Key2", DummyString(1024, v)));
+    ASSERT_OK(Put("Key3", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Put("Key1", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(penalties.size(), 8);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class DBWALTestWithParam
+    : public DBSSTTest,
+      public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+  DBWALTestWithParam() {
+    wal_dir_ = std::get<0>(GetParam());
+    wal_dir_same_as_dbname_ = std::get<1>(GetParam());
+  }
+
+  std::string wal_dir_;
+  bool wal_dir_same_as_dbname_;
+};
+
+TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
+  class MyEnv : public EnvWrapper {
+   public:
+    MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {}
+
+    Status DeleteFile(const std::string& fname) override {
+      if (fname.find(".log.trash") != std::string::npos && fake_log_delete) {
+        return Status::OK();
+      }
+
+      return target()->DeleteFile(fname);
+    }
+
+    void set_fake_log_delete(bool fake) { fake_log_delete = fake; }
+
+   private:
+    bool fake_log_delete;
+  };
+
+  std::unique_ptr<MyEnv> env(new MyEnv(Env::Default()));
+  Destroy(last_options_);
+
+  env->set_fake_log_delete(true);
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.wal_dir = dbname_ + wal_dir_;
+
+  int64_t rate_bytes_per_sec = 1024 * 10;  // 10 KB/sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+  options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+  ASSERT_OK(TryReopen(options));
+
+  // Create 4 files in L0
+  for (char v = 'a'; v <= 'd'; v++) {
+    ASSERT_OK(Put("Key2", DummyString(1024, v)));
+    ASSERT_OK(Put("Key3", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Put("Key1", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  Close();
+
+  options.sst_file_manager.reset();
+  std::vector<std::string> filenames;
+  int trash_log_count = 0;
+  if (!wal_dir_same_as_dbname_) {
+    // Forcibly create some trash log files
+    std::unique_ptr<WritableFile> result;
+    env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result,
+                         EnvOptions());
+    result.reset();
+  }
+  env->GetChildren(options.wal_dir, &filenames);
+  for (const std::string& fname : filenames) {
+    if (fname.find(".log.trash") != std::string::npos) {
+      trash_log_count++;
+    }
+  }
+  ASSERT_GE(trash_log_count, 1);
+
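+  // At this point the WAL directory holds at least one "*.log.trash" entry:
+  // either a WAL whose deletion was faked by MyEnv above or the explicitly
+  // created "1000.log.trash". (A concrete name such as "000007.log.trash" is
+  // hypothetical; only the suffix matters to the check above.)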
+  env->set_fake_log_delete(false);
+  ASSERT_OK(TryReopen(options));
+
+  filenames.clear();
+  trash_log_count = 0;
+  env->GetChildren(options.wal_dir, &filenames);
+  for (const std::string& fname : filenames) {
+    if (fname.find(".log.trash") != std::string::npos) {
+      trash_log_count++;
+    }
+  }
+  ASSERT_EQ(trash_log_count, 0);
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam,
+                        ::testing::Values(std::make_tuple("", true),
+                                          std::make_tuple("_wal_dir", false)));
+
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+  Options options = CurrentOptions();
+
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+  Destroy(last_options_);
+
+  // Add some trash files to the db directory so the DB can clean them up
+  env_->CreateDirIfMissing(dbname_);
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+  // Reopen the DB and verify that it deletes existing trash files
+  ASSERT_OK(TryReopen(options));
+  sfm->WaitForEmptyTrash();
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
+// Create a DB with 2 db_paths, and generate multiple files in the 2
+// db_paths using CompactRangeOptions. Make sure that files that were
+// deleted from the first db_path were deleted using DeleteScheduler and
+// files in the second path were not.
+TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
+  std::atomic<int> bg_delete_file(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  // The deletion scheduler sometimes skips marking a file as trash according
+  // to a heuristic. In that case the deletion goes through the SyncPoint
+  // below.
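+  // (The heuristic referenced above is, as far as these tests rely on it, the
+  // trash/DB-size ratio: when accumulated trash would exceed
+  // max_trash_db_ratio times the tracked SST size, DeleteScheduler deletes
+  // immediately instead of renaming to "*.trash" first, so both sync points
+  // must be counted to get a stable total.)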
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { bg_delete_file++; });
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.db_paths.emplace_back(dbname_, 1024 * 100);
+  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
+  options.env = env_;
+
+  int64_t rate_bytes_per_sec = 1024 * 1024;  // 1 MB/sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s,
+                        /* max_trash_db_ratio= */ 1.1));
+
+  ASSERT_OK(s);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+  DestroyAndReopen(options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  // Create 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'), wo));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+  // Compaction will delete files from L0 in the first db path and generate a
+  // new file in L1 in the second db path
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = 1;
+  Slice begin("Key0");
+  Slice end("Key3");
+  ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  // Create 4 files in L0
+  for (int i = 4; i < 8; i++) {
+    ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'), wo));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("4,1", FilesPerLevel(0));
+
+  // Compaction will delete files from L0 in the first db path and generate a
+  // new file in L1 in the second db path
+  begin = "Key4";
+  end = "Key7";
+  ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  ASSERT_EQ("0,2", FilesPerLevel(0));
+
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, 8);
+
+  // Compaction will delete both files and regenerate a file in L1 in the
+  // second db path. The deleted files should still be cleaned up via the
+  // delete scheduler.
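+  // Counting sketch for the assertions around this point: two L0->L1
+  // compactions have each trashed 4 files, so bg_delete_file is 8 here; the
+  // bottommost-force compaction below rewrites the two L1 files, so the
+  // counter is expected to land at 8 + 2 = 10.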
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, 10);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
+  int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+  DestroyAndReopen(options);
+
+  // Create 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A')));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  // Close the DB and destroy it using DeleteScheduler
+  Close();
+
+  int num_sst_files = 0;
+  int num_wal_files = 0;
+  std::vector<std::string> db_files;
+  env_->GetChildren(dbname_, &db_files);
+  for (std::string f : db_files) {
+    if (f.substr(f.find_last_of(".") + 1) == "sst") {
+      num_sst_files++;
+    } else if (f.substr(f.find_last_of(".") + 1) == "log") {
+      num_wal_files++;
+    }
+  }
+  ASSERT_GT(num_sst_files, 0);
+  ASSERT_GT(num_wal_files, 0);
+
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+  sfm->SetDeleteRateBytesPerSecond(1024 * 1024);
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+  ASSERT_OK(DestroyDB(dbname_, options));
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files);
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // Generate a file containing 100 keys.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+
+  uint64_t first_file_size = 0;
+  auto files_in_db = GetAllSSTFiles(&first_file_size);
+  ASSERT_EQ(sfm->GetTotalSize(), first_file_size);
+
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(first_file_size + 1);
+
+  ASSERT_OK(Put("key1", "val1"));
+  // This flush will set bg_error_ and fail
+  ASSERT_NOK(Flush());
+}
+
+TEST_F(DBSSTTest, CancellingCompactionsWorks) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.level0_file_num_compaction_trigger = 2;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  int completed_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():CancelledCompaction",
+      [&](void* /*arg*/) {
+        sfm->SetMaxAllowedSpaceUsage(0);
+        ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
+      [&](void* /*arg*/) { completed_compactions++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t total_file_size = 0;
+  auto files_in_db = GetAllSSTFiles(&total_file_size);
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+  // Generate another file to trigger compaction.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+  dbfull()->TEST_WaitForCompact(true);
+
+  // Because we set a callback in CancelledCompaction, we actually
+  // let the compaction run
+  ASSERT_GT(completed_compactions, 0);
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  // Make sure the stat is bumped
+  ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingManualCompactionsWorks) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.statistics = CreateDBStatistics();
+
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t total_file_size = 0;
+  auto files_in_db = GetAllSSTFiles(&total_file_size);
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+  // Generate another file to trigger compaction.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+
+  // OK, now trigger a manual compaction
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  // Wait for the manual compaction to get scheduled and finish
+  dbfull()->TEST_WaitForCompact(true);
+
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  // Make sure the stat is bumped
+  ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            1);
+
+  // Now make sure CompactFiles also gets cancelled
+  auto l0_files = collector->GetFlushedFiles();
+  dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0);
+
+  // Wait for the manual compaction to get scheduled and finish
+  dbfull()->TEST_WaitForCompact(true);
+
+  ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            2);
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+
+  // Now let the flush through and make sure GetCompactionsReservedSize
+  // returns to normal
+  sfm->SetMaxAllowedSpaceUsage(0);
+  int completed_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0);
+  dbfull()->TEST_WaitForCompact(true);
+
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  ASSERT_GT(completed_compactions, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
+  // This test will set a maximum allowed space for the DB, then it will
+  // keep filling the DB until the limit is reached and bg_error_ is set.
+  // When bg_error_ is set we will verify that the DB size is greater
+  // than the limit.
+
+  std::vector<uint64_t> max_space_limits_mbs = {1, 10};
+  std::atomic<bool> bg_error_set(false);
+
+  std::atomic<int> reached_max_space_on_flush(0);
+  std::atomic<int> reached_max_space_on_compaction(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+      [&](void* arg) {
+        Status* bg_error = static_cast<Status*>(arg);
+        bg_error_set = true;
+        reached_max_space_on_flush++;
+        // clear the error to ensure the compaction callback is called
+        *bg_error = Status::OK();
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+        bool* enough_room = static_cast<bool*>(arg);
+        *enough_room = true;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
+      [&](void* /*arg*/) {
+        bg_error_set = true;
+        reached_max_space_on_compaction++;
+      });
+
+  for (auto limit_mb : max_space_limits_mbs) {
+    bg_error_set = false;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+    auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+    Options options = CurrentOptions();
+    options.sst_file_manager = sst_file_manager;
+    options.write_buffer_size = 1024 * 512;  // 512 KB
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
+
+    // It is easy to detect if the test is stuck in a loop. No need for
+    // complex termination logic.
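+    // Fill-rate sketch (illustrative, order of magnitude only): each Put
+    // below adds a 10-byte key and a 50-byte value, so very roughly
+    // limit_mb * 1024 * 1024 / 60 successful writes are expected before
+    // bg_error_ fires and Put() starts returning a non-OK status.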
+    while (true) {
+      auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50));
+      if (!s.ok()) {
+        break;
+      }
+    }
+    ASSERT_TRUE(bg_error_set);
+    uint64_t total_sst_files_size = 0;
+    GetAllSSTFiles(&total_sst_files_size);
+    ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+
+  ASSERT_GT(reached_max_space_on_flush, 0);
+  ASSERT_GT(reached_max_space_on_compaction, 0);
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
+  // Open DB with infinite max open files
+  //  - First iteration uses 1 thread to open files
+  //  - Second iteration uses 5 threads to open files
+  for (int iter = 0; iter < 2; iter++) {
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = 100000;
+    options.disable_auto_compactions = true;
+    options.max_open_files = -1;
+    if (iter == 0) {
+      options.max_file_opening_threads = 1;
+    } else {
+      options.max_file_opening_threads = 5;
+    }
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    // Create 12 files in L0 (then move them to L2)
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L2_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    CompactRangeOptions compact_options;
+    compact_options.change_level = true;
+    compact_options.target_level = 2;
+    db_->CompactRange(compact_options, nullptr, nullptr);
+
+    // Create 12 files in L0
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L0_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    Close();
+
+    // Reopening the DB will load all existing files
+    Reopen(options);
+    ASSERT_EQ("12,0,12", FilesPerLevel(0));
+    std::vector<std::vector<FileMetaData>> files;
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+    for (const auto& level : files) {
+      for (const auto& file : level) {
+        ASSERT_TRUE(file.table_reader_handle != nullptr);
+      }
+    }
+
+    for (int i = 0; i < 12; i++) {
+      ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+      ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+    }
+  }
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSize) {
+  // We don't propagate the oldest-key-time table property on compaction and
+  // just write 0 as the default value. This affects the exact table size,
+  // since we encode table properties as varint64. Force time to be 0 to work
+  // around it. Should remove the workaround after we propagate the property
+  // on compaction.
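+  // Why pinning the clock at 0 works (illustrative aside): table properties
+  // are encoded as varint64s, and an oldest-key-time of 0 encodes in a single
+  // byte on every file, keeping the five L0 files equal in size; a real Unix
+  // timestamp such as 1600000000 would take 5 bytes.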
+  std::unique_ptr<MockTimeEnv> mock_env(new MockTimeEnv(Env::Default()));
+  mock_env->set_current_time(0);
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = mock_env.get();
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 10; j++) {
+      std::string val = "val_file_" + ToString(i);
+      ASSERT_OK(Put(Key(j), val));
+    }
+    Flush();
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
+
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold the current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+  // Compact 5 files into 1 file in L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 1);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 1 (compacted file)
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+  // hold the current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+  // Delete all keys and compact; this will delete all live files
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  Flush();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+  iter1.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 1 (compacted file)
+  ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+  iter2.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
+
+  // Close the db before mock_env is destroyed.
+  Close();
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Put(Key(i), "val"));
+    Flush();
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
+
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold the current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+  // Compaction will do a trivial move from L0 to L1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,5", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5 (used in 2 versions)
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold the current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+  // Delete all keys and compact; this will delete all live files
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  Flush();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 5 (used in 2 versions)
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  iter1.reset();
+  iter2.reset();
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_statistics_test.cc b/src/rocksdb/db/db_statistics_test.cc
new file mode 100644
index 000000000..8fbbb96d5
--- /dev/null
+++ b/src/rocksdb/db/db_statistics_test.cc
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+
+#include "db/db_test_util.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBStatisticsTest : public DBTestBase {
+ public:
+  DBStatisticsTest() : DBTestBase("/db_statistics_test") {}
+};
+
+TEST_F(DBStatisticsTest, CompressionStatsTest) {
+  CompressionType type;
+
+  if (Snappy_Supported()) {
+    type = kSnappyCompression;
+    fprintf(stderr, "using snappy\n");
+  } else if (Zlib_Supported()) {
+    type = kZlibCompression;
+    fprintf(stderr, "using zlib\n");
+  } else if (BZip2_Supported()) {
+    type = kBZip2Compression;
+    fprintf(stderr, "using bzip2\n");
+  } else if (LZ4_Supported()) {
+    type = kLZ4Compression;
+    fprintf(stderr, "using lz4\n");
+  } else if (XPRESS_Supported()) {
+    type = kXpressCompression;
+    fprintf(stderr, "using xpress\n");
+  } else if (ZSTD_Supported()) {
+    type = kZSTD;
+    fprintf(stderr, "using ZSTD\n");
+  } else {
+    fprintf(stderr, "skipping test, compression disabled\n");
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.compression = type;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+  DestroyAndReopen(options);
+
+  int kNumKeysWritten = 100000;
+
+  // Check that compressions occur and are counted when compression is turned
+  // on
+  Random rnd(301);
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    // compressible string
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0);
+
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    auto r = Get(Key(i));
+  }
+  ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0);
+
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+  uint64_t currentCompressions =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  uint64_t currentDecompressions =
+      options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED);
+
+  // Check that compressions do not occur when turned off
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    // compressible string
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) -
+                currentCompressions,
+            0);
+
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    auto r = Get(Key(i));
+  }
+  ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) -
+                currentDecompressions,
+            0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  const uint64_t kMutexWaitDelay = 100;
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+                                       kMutexWaitDelay);
+  ASSERT_OK(Put("hello", "rocksdb"));
+  ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0);
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  const uint64_t kMutexWaitDelay = 100;
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+                                       kMutexWaitDelay);
+  ASSERT_OK(Put("hello", "rocksdb"));
+  ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, ResetStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  for (int i = 0; i < 2; ++i) {
+    // pick an arbitrary ticker and histogram. On the first iteration they're
+    // zero because the db is unused. On the second iteration they're zero due
+    // to Reset().
+    ASSERT_EQ(0, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+    HistogramData histogram_data;
+    options.statistics->histogramData(DB_WRITE, &histogram_data);
+    ASSERT_EQ(0.0, histogram_data.max);
+
+    if (i == 0) {
+      // The Put() makes some of the ticker/histogram stats nonzero until we
+      // Reset().
+      ASSERT_OK(Put("hello", "rocksdb"));
+      ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+      options.statistics->histogramData(DB_WRITE, &histogram_data);
+      ASSERT_GT(histogram_data.max, 0.0);
+      options.statistics->Reset();
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_table_properties_test.cc b/src/rocksdb/db/db_table_properties_test.cc
new file mode 100644
index 000000000..e3499df70
--- /dev/null
+++ b/src/rocksdb/db/db_table_properties_test.cc
@@ -0,0 +1,336 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper function that ensures the table properties returned by
+// `GetPropertiesOfAllTablesTest` are correct.
+// This test assumes the entries size is different for each of the tables.
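+// Sketch of the verification logic below (assuming tables with 10, 11, 12,
+// and 13 entries, as built by GetPropertiesOfAllTablesTest): inserting each
+// num_entries into an unordered_set and requiring
+// props.size() == unique_entries.size() proves the per-table counts are
+// pairwise distinct, while the running sum must equal 10 + 11 + 12 + 13 = 46.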
+namespace {
+
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+  TablePropertiesCollection props;
+  ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+  ASSERT_EQ(4U, props.size());
+  std::unordered_set<uint64_t> unique_entries;
+
+  // Indirect test
+  uint64_t sum = 0;
+  for (const auto& item : props) {
+    unique_entries.insert(item.second->num_entries);
+    sum += item.second->num_entries;
+  }
+
+  ASSERT_EQ(props.size(), unique_entries.size());
+  ASSERT_EQ(expected_entries_size, sum);
+}
+}  // namespace
+
+class DBTablePropertiesTest : public DBTestBase {
+ public:
+  DBTablePropertiesTest() : DBTestBase("/db_table_properties_test") {}
+  TablePropertiesCollection TestGetPropertiesOfTablesInRange(
+      std::vector<Range> ranges, std::size_t* num_properties = nullptr,
+      std::size_t* num_files = nullptr);
+};
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 8;
+  Reopen(options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+    }
+    db_->Flush(FlushOptions());
+  }
+
+  // 1. Read table properties directly from the files
+  Reopen(options);
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 2. Put two tables into the table cache and
+  Reopen(options);
+  // fetch a key from the 1st and 2nd table, which will internally place that
+  // table in the table cache.
+  for (int i = 0; i < 2; ++i) {
+    Get(ToString(i * 100 + 0));
+  }
+
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 3. Put all tables into the table cache
+  Reopen(options);
+  // fetch a key from each table, which will internally place that table in
+  // the table cache.
+  for (int i = 0; i < 4; ++i) {
+    Get(ToString(i * 100 + 0));
+  }
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+}
+
+TablePropertiesCollection
+DBTablePropertiesTest::TestGetPropertiesOfTablesInRange(
+    std::vector<Range> ranges, std::size_t* num_properties,
+    std::size_t* num_files) {
+  // Since we deref the zeroth element of the vector, it cannot be empty;
+  // otherwise we would pass an address to some random memory.
+  EXPECT_GT(ranges.size(), 0U);
+  // run the query
+  TablePropertiesCollection props;
+  EXPECT_OK(db_->GetPropertiesOfTablesInRange(
+      db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props));
+
+  // Make sure that we've received properties for those files, and only those
+  // files, which fall within the requested ranges
+  std::vector<LiveFileMetaData> vmd;
+  db_->GetLiveFilesMetaData(&vmd);
+  for (auto& md : vmd) {
+    std::string fn = md.db_path + md.name;
+    bool in_range = false;
+    for (auto& r : ranges) {
+      // smallestkey < limit && largestkey >= start
+      if (r.limit.compare(md.smallestkey) >= 0 &&
+          r.start.compare(md.largestkey) <= 0) {
+        in_range = true;
+        EXPECT_GT(props.count(fn), 0);
+      }
+    }
+    if (!in_range) {
+      EXPECT_EQ(props.count(fn), 0);
+    }
+  }
+
+  if (num_properties) {
+    *num_properties = props.size();
+  }
+
+  if (num_files) {
+    *num_files = vmd.size();
+  }
+  return props;
+}
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
+  // Fixed random seed
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 40960;
+  options.max_bytes_for_level_multiplier = 4;
+  options.hard_pending_compaction_bytes_limit = 16 * 1024;
+  options.num_levels = 8;
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  // build a decent LSM
+  for (int i = 0; i < 10000; i++) {
+    ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  if (NumTableFilesAtLevel(0) == 0) {
+    ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102)));
+    Flush();
+  }
+
+  db_->PauseBackgroundWork();
+
+  // Ensure that we have at least L0, L1 and L2
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+  // Query the largest range
+  std::size_t num_properties, num_files;
+  TestGetPropertiesOfTablesInRange(
+      {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST),
+             test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+      &num_properties, &num_files);
+  ASSERT_EQ(num_properties, num_files);
+
+  // Query the empty range
+  TestGetPropertiesOfTablesInRange(
+      {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST),
+             test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))},
+      &num_properties, &num_files);
+  ASSERT_GT(num_files, 0);
+  ASSERT_EQ(num_properties, 0);
+
+  // Query the middle range
+  TestGetPropertiesOfTablesInRange(
+      {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE),
+             test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+      &num_properties, &num_files);
+  ASSERT_GT(num_files, 0);
+  ASSERT_GT(num_files, num_properties);
+  ASSERT_GT(num_properties, 0);
+
+  // Query a bunch of random ranges
+  for (int j = 0; j < 100; j++) {
+    // create a bunch of ranges
+    std::vector<std::string> random_keys;
+    // Random::Uniform can return zero, and passing empty ranges to
+    // TestGetPropertiesOfTablesInRange() would deref random memory in the
+    // empty ranges[0], so n must be greater than zero; it must also be even,
+    // since the loop below consumes the keys in pairs.
+    auto n = 2 * (rnd.Uniform(50) + 1);
+
+    for (uint32_t i = 0; i < n; ++i) {
+      random_keys.push_back(test::RandomKey(&rnd, 5));
+    }
+
+    ASSERT_GT(random_keys.size(), 0U);
+    ASSERT_EQ((random_keys.size() % 2), 0U);
+
+    std::vector<Range> ranges;
+    auto it = random_keys.begin();
+    while (it != random_keys.end()) {
+      ranges.push_back(Range(*it, *(it + 1)));
+      it += 2;
+    }
+
+    TestGetPropertiesOfTablesInRange(std::move(ranges));
+  }
+}
+
+TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) {
+  std::string kExtraCfName = "pikachu";
+  CreateAndReopenWithCF({kExtraCfName}, CurrentOptions());
+
+  // Create one table per CF, then verify it was created with the column
+  // family name property.
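+  // Expected mapping for the two column families in this test:
+  //   cf 0 -> kDefaultColumnFamilyName ("default"), column_family_id 0
+  //   cf 1 -> kExtraCfName ("pikachu"),             column_family_id 1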
+  for (uint32_t cf = 0; cf < 2; ++cf) {
+    Put(cf, "key", "val");
+    Flush(cf);
+
+    TablePropertiesCollection fname_to_props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+    ASSERT_EQ(1U, fname_to_props.size());
+
+    std::string expected_cf_name;
+    if (cf > 0) {
+      expected_cf_name = kExtraCfName;
+    } else {
+      expected_cf_name = kDefaultColumnFamilyName;
+    }
+    ASSERT_EQ(expected_cf_name,
+              fname_to_props.begin()->second->column_family_name);
+    ASSERT_EQ(cf, static_cast<uint32_t>(
+                      fname_to_props.begin()->second->column_family_id));
+  }
+}
+
+TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) {
+  int kNumKeys = 1000;
+  int kWindowSize = 100;
+  int kNumDelsTrigger = 90;
+  std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger);
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(compact_on_del);
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  Put(Key(0), "val");
+  Flush();
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+  // Change the window size and deletion trigger and ensure the new values
+  // take effect
+  kWindowSize = 50;
+  kNumDelsTrigger = 40;
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetWindowSize(kWindowSize);
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetDeletionTrigger(kNumDelsTrigger);
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+  // Change the window size to disable delete triggered compaction
+  kWindowSize = 0;
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetWindowSize(kWindowSize);
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetDeletionTrigger(kNumDelsTrigger);
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc
new file mode 100644
index 000000000..39988638b
--- /dev/null
+++ b/src/rocksdb/db/db_tailing_iter_test.cc
@@ -0,0 +1,547 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release build,
+// which is a pity, it is a good test +#if !defined(ROCKSDB_LITE) + +#include "db/db_test_util.h" +#include "db/forward_iterator.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBTestTailingIterator : public DBTestBase { + public: + DBTestTailingIterator() : DBTestBase("/db_tailing_iterator_test") {} +}; + +TEST_F(DBTestTailingIterator, TailingIteratorSingle) { + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + ASSERT_TRUE(!iter->Valid()); + + // add a record and check that iter can see it + ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "mirko"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + +TEST_F(DBTestTailingIterator, TailingIteratorKeepAdding) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + std::string value(1024, 'a'); + + const int num_records = 10000; + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%016d", i); + + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST_F(DBTestTailingIterator, TailingIteratorSeekToNext) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + std::string value(1024, 'a'); + + const int num_records = 1000; + for (int i = 1; i < num_records; ++i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } + + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + if (i == 1) { + itern->SeekToFirst(); + } else { + itern->Next(); + } + ASSERT_TRUE(itern->Valid()); + ASSERT_EQ(itern->key().compare(key), 0); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + for (int i = 2 * num_records; i > 0; --i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } + + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { + const uint64_t k150KB = 150 * 1024; + Options options; + options.write_buffer_size = k150KB; + options.max_write_buffer_number = 3; + options.min_write_buffer_number_to_merge = 2; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + ReadOptions read_options; + read_options.tailing = true; + int num_iters, deleted_iters; + + char bufe[32]; + snprintf(bufe, sizeof(bufe), "00b0%016d", 0); + Slice keyu(bufe, 20); + read_options.iterate_upper_bound = &keyu; + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + std::unique_ptr 
itern(db_->NewIterator(read_options, handles_[1])); + std::unique_ptr iterh(db_->NewIterator(read_options, handles_[1])); + std::string value(1024, 'a'); + bool file_iters_deleted = false; + bool file_iters_renewed_null = false; + bool file_iters_renewed_copy = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::SeekInternal:Return", [&](void* arg) { + ForwardIterator* fiter = reinterpret_cast(arg); + ASSERT_TRUE(!file_iters_deleted || + fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::Next:Return", [&](void* arg) { + ForwardIterator* fiter = reinterpret_cast(arg); + ASSERT_TRUE(!file_iters_deleted || + fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::RenewIterators:Null", + [&](void* /*arg*/) { file_iters_renewed_null = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::RenewIterators:Copy", + [&](void* /*arg*/) { file_iters_renewed_copy = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + const int num_records = 1000; + for (int i = 1; i < num_records; ++i) { + char buf1[32]; + char buf2[32]; + char buf3[32]; + char buf4[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + Slice keyn(buf3, 20); + ASSERT_OK(Put(1, keyn, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + if (i == 299) { + file_iters_deleted = true; + } + snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2); + Slice target(buf4, 20); + iterh->Seek(target); + ASSERT_TRUE(iter->Valid()); + for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) { + iterh->Next(); + ASSERT_TRUE(iterh->Valid()); + } + if (i == 299) { + file_iters_deleted = false; + } + } + + file_iters_deleted = true; + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + ASSERT_LE(num_iters, 1); + if (i == 1) { + itern->SeekToFirst(); + } else { + itern->Next(); + } + ASSERT_TRUE(itern->Valid()); + ASSERT_EQ(itern->key().compare(key), 0); + ASSERT_LE(num_iters, 1); + file_iters_deleted = false; + } + ASSERT_TRUE(file_iters_renewed_null); + ASSERT_TRUE(file_iters_renewed_copy); + iter = nullptr; + itern = nullptr; + iterh = nullptr; + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.block_cache_compressed = nullptr; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + read_options.read_tier = kBlockCacheTier; + std::unique_ptr iteri(db_->NewIterator(read_options, handles_[1])); + char buf5[32]; + snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2); + Slice target1(buf5, 20); + iteri->Seek(target1); + ASSERT_TRUE(iteri->status().IsIncomplete()); + iteri = nullptr; + + read_options.read_tier = kReadAllTier; + options.table_factory.reset(NewBlockBasedTableFactory()); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + iter.reset(db_->NewIterator(read_options, handles_[1])); + for (int i = 2 * num_records; i > 0; --i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, 
value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } + + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } +} + +TEST_F(DBTestTailingIterator, TailingIteratorDeletes) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + + // write a single record, read it using the iterator, then delete it + ASSERT_OK(Put(1, "0test", "test")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0test"); + ASSERT_OK(Delete(1, "0test")); + + // write many more records + const int num_records = 10000; + std::string value(1024, 'A'); + + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "1%015d", i); + + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + } + + // force a flush to make sure that no records are read from memtable + ASSERT_OK(Flush(1)); + + // skip "0test" + iter->Next(); + + // make sure we can read all new records using the existing iterator + int count = 0; + for (; iter->Valid(); iter->Next(), ++count) ; + + ASSERT_EQ(count, num_records); +} + +TEST_F(DBTestTailingIterator, TailingIteratorPrefixSeek) { + ReadOptions read_options; + read_options.tailing = true; + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + options.allow_concurrent_memtable_write = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(Put(1, "0101", "test")); + + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "0202", "test")); + + // Seek(0102) shouldn't find any records since 0202 has a different prefix + iter->Seek("0102"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("0202"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0202"); + + iter->Next(); + ASSERT_TRUE(!iter->Valid()); +} + +TEST_F(DBTestTailingIterator, TailingIteratorIncomplete) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ReadOptions read_options; + read_options.tailing = true; + read_options.read_tier = kBlockCacheTier; + + std::string key("key"); + std::string value("value"); + + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + // we either see the entry or it's not in cache + ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + iter->SeekToFirst(); + // should still be true after compaction + ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); +} + +TEST_F(DBTestTailingIterator, TailingIteratorSeekToSame) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 1000; + CreateAndReopenWithCF({"pikachu"}, options); + + ReadOptions read_options; + read_options.tailing = true; + + const int NROWS = 10000; + // Write rows with keys 00000, 00002, 00004 etc. 
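+  // A tailing Seek() positions the iterator at the first key >= target, so
+  // seeking to an odd key such as "00001" should land on the next even key.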
+  for (int i = 0; i < NROWS; ++i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%05d", 2*i);
+    std::string key(buf);
+    std::string value("value");
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  // Seek to 00001. We expect to find 00002.
+  std::string start_key = "00001";
+  iter->Seek(start_key);
+  ASSERT_TRUE(iter->Valid());
+
+  std::string found = iter->key().ToString();
+  ASSERT_EQ("00002", found);
+
+  // Now seek to the same key. The iterator should remain in the same
+  // position.
+  iter->Seek(found);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(found, iter->key().ToString());
+}
+
+// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call
+// Seek() on immutable iterators when target key is >= prev_key and all
+// iterators, including the memtable iterator, are over the upper bound.
+TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  const Slice upper_bound("20", 3);
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.iterate_upper_bound = &upper_bound;
+
+  ASSERT_OK(Put(1, "11", "11"));
+  ASSERT_OK(Put(1, "12", "12"));
+  ASSERT_OK(Put(1, "22", "22"));
+  ASSERT_OK(Flush(1));  // flush all those keys to an immutable SST file
+
+  // Add another key to the memtable.
+  ASSERT_OK(Put(1, "21", "21"));
+
+  std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+  it->Seek("12");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("12", it->key().ToString());
+
+  it->Next();
+  // Not valid since "21" is over the upper bound.
+  ASSERT_FALSE(it->Valid());
+
+  // This keeps track of the number of times NeedToSeekImmutable() was true.
+  int immutable_seeks = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::SeekInternal:Immutable",
+      [&](void* /*arg*/) { ++immutable_seeks; });
+
+  // Seek to 13. This should not require any immutable seeks.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  it->Seek("13");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_FALSE(it->Valid());
+  ASSERT_EQ(0, immutable_seeks);
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorGap) {
+  // level 1:  [20, 25]  [35, 40]
+  // level 2:  [10 - 15] [45 - 50]
+  // level 3:  [20, 30, 40]
+  // Previously there was a bug in the tailing iterator: when there is a gap
+  // in a lower level, a key was skipped if it fell between the largest key
+  // of file n and the smallest key of file n+1, with both files fitting in
+  // that gap.
In this example, 25 < key < 35 + // https://github.com/facebook/rocksdb/issues/1372 + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + + ReadOptions read_options; + read_options.tailing = true; + + ASSERT_OK(Put(1, "20", "20")); + ASSERT_OK(Put(1, "30", "30")); + ASSERT_OK(Put(1, "40", "40")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(3, 1); + + ASSERT_OK(Put(1, "10", "10")); + ASSERT_OK(Put(1, "15", "15")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "45", "45")); + ASSERT_OK(Put(1, "50", "50")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + ASSERT_OK(Put(1, "20", "20")); + ASSERT_OK(Put(1, "25", "25")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "35", "35")); + ASSERT_OK(Put(1, "40", "40")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(1, 1); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(handles_[1], &meta); + + std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + it->Seek("30"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("30", it->key().ToString()); + + it->Next(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("35", it->key().ToString()); + + it->Next(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("40", it->key().ToString()); +} + +TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) { + ReadOptions read_options; + read_options.tailing = true; + const Slice upper_bound("cc", 3); + read_options.iterate_upper_bound = &upper_bound; + + + // 1st L0 file + ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN")); + ASSERT_OK(Flush()); + + // 2nd L0 file + ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN")); + ASSERT_OK(Flush()); + + std::unique_ptr iter(db_->NewIterator(read_options)); + + iter->Seek("aa"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aa"); +} + +TEST_F(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) { + ReadOptions read_options; + read_options.tailing = true; + const Slice upper_bound("cc", 3); + read_options.iterate_upper_bound = &upper_bound; + + + // 1st L0 file + ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN")); + ASSERT_OK(Flush()); + + // 2nd L0 file + ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN")); + ASSERT_OK(Flush()); + + std::unique_ptr iter(db_->NewIterator(read_options)); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aa"); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aa"); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void) argc; + (void) argv; + return 0; +#endif +} diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc new file mode 100644 index 000000000..60b4d60f4 --- /dev/null +++ b/src/rocksdb/db/db_test.cc @@ -0,0 +1,6605 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Introduction of SyncPoint effectively disabled building and running this test +// in Release build. 
+// which is a pity, it is a good test +#include +#include +#include +#include +#include +#include +#ifndef OS_WIN +#include +#endif +#ifdef OS_SOLARIS +#include +#endif + +#include "cache/lru_cache.h" +#include "db/blob_index.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/job_context.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "memtable/hash_linklist_rep.h" +#include "monitoring/thread_status_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/experimental.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/thread_status.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/mock_table.h" +#include "table/plain/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/compression.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +class DBTest : public DBTestBase { + public: + DBTest() : DBTestBase("/db_test") {} +}; + +class DBTestWithParam + : public DBTest, + public testing::WithParamInterface> { + public: + DBTestWithParam() { + max_subcompactions_ = std::get<0>(GetParam()); + exclusive_manual_compaction_ = std::get<1>(GetParam()); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + uint32_t max_subcompactions_; + bool exclusive_manual_compaction_; +}; + +TEST_F(DBTest, MockEnvTest) { + std::unique_ptr env{new MockEnv(Env::Default())}; + Options options; + options.create_if_missing = true; + options.env = env.get(); + DB* db; + + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + +// TEST_FlushMemTable() is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } +#endif // ROCKSDB_LITE + + delete db; +} + +// NewMemEnv returns 
nullptr in ROCKSDB_LITE since class InMemoryEnv isn't +// defined. +#ifndef ROCKSDB_LITE +TEST_F(DBTest, MemEnvTest) { + std::unique_ptr env{NewMemEnv(Env::Default())}; + Options options; + options.create_if_missing = true; + options.env = env.get(); + DB* db; + + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; + + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + delete db; +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, WriteEmptyBatch) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + WriteOptions wo; + wo.sync = true; + wo.disableWAL = false; + WriteBatch empty_batch; + ASSERT_OK(dbfull()->Write(wo, &empty_batch)); + + // make sure we can re-open it. + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + ASSERT_EQ("bar", Get(1, "foo")); +} + +TEST_F(DBTest, SkipDelay) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + + for (bool sync : {true, false}) { + for (bool disableWAL : {true, false}) { + if (sync && disableWAL) { + // sync and disableWAL is incompatible. + continue; + } + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Sleep", + [&](void* /*arg*/) { sleep_count.fetch_add(1); }); + std::atomic wait_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { wait_count.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = sync; + wo.disableWAL = disableWAL; + wo.no_slowdown = true; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
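+      // With wo.no_slowdown set, a write that would have to wait for the
+      // delay fails immediately with Status::Incomplete() instead of
+      // blocking, which is why the next Put is expected to be NOK.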
+      ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2"));
+      ASSERT_GE(sleep_count.load(), 0);
+      ASSERT_GE(wait_count.load(), 0);
+      token.reset();
+
+      token = dbfull()->TEST_write_controler().GetDelayToken(1000000000);
+      wo.no_slowdown = false;
+      ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3"));
+      ASSERT_GE(sleep_count.load(), 1);
+      token.reset();
+    }
+  }
+}
+
+TEST_F(DBTest, MixedSlowdownOptions) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+  };
+  // Use a small number to ensure a large delay that is still effective
+  // when we do Put
+  // TODO(myabandeh): this is time dependent and could potentially make
+  // the test flaky
+  auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+  std::atomic<int> sleep_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) {
+        sleep_count.fetch_add(1);
+        if (threads.empty()) {
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_slowdown_func);
+          }
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_no_slowdown_func);
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = false;
+  wo.disableWAL = false;
+  wo.no_slowdown = false;
+  dbfull()->Put(wo, "foo", "bar");
+  // We need the 2nd write to trigger delay. This is because delay is
+  // estimated based on the last write size which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(sleep_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} + +TEST_F(DBTest, MixedSlowdownOptionsInQueue) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetDelayToken(1); + std::atomic sleep_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) { + sleep_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + }); + std::atomic wait_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { wait_count.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_EQ(sleep_count.load(), 1); + ASSERT_GE(wait_count.load(), 0); +} + +TEST_F(DBTest, MixedSlowdownOptionsStop) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + std::atomic thread_num(0); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + ASSERT_NOK(dbfull()->Put(wo, key, "bar")); + }; + std::function wakeup_writer = [&]() { + dbfull()->mutex_.Lock(); + dbfull()->bg_cv_.SignalAll(); + dbfull()->mutex_.Unlock(); + }; + // Use a small number to ensure a large delay that is still effective + // when we do Put + // TODO(myabandeh): this is time dependent and could potentially make + // the test flaky + auto token = dbfull()->TEST_write_controler().GetStopToken(); + std::atomic wait_count(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) { + wait_count.fetch_add(1); + if (threads.empty()) { + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_slowdown_func); + } + for (int i = 0; i < 2; ++i) { + threads.emplace_back(write_no_slowdown_func); + } + // Sleep for 2s to allow the threads to insert themselves into the + // write queue + env_->SleepForMicroseconds(3000000ULL); + } + token.reset(); + threads.emplace_back(wakeup_writer); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions wo; + wo.sync = false; + wo.disableWAL = false; + wo.no_slowdown = false; + dbfull()->Put(wo, "foo", "bar"); + // We need the 2nd write to trigger delay. This is because delay is + // estimated based on the last write size which is 0 for the first write. 
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); + token.reset(); + + for (auto& t : threads) { + t.join(); + } + ASSERT_GE(wait_count.load(), 1); + + wo.no_slowdown = true; + ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); +} +#ifndef ROCKSDB_LITE + +TEST_F(DBTest, LevelLimitReopen) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + + const std::string value(1024 * 1024, ' '); + int i = 0; + while (NumTableFilesAtLevel(2, 1) == 0) { + ASSERT_OK(Put(1, Key(i++), value)); + } + + options.num_levels = 1; + options.max_bytes_for_level_multiplier_additional.resize(1, 1); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(s.IsInvalidArgument(), true); + ASSERT_EQ(s.ToString(), + "Invalid argument: db has more levels than options.num_levels"); + + options.num_levels = 10; + options.max_bytes_for_level_multiplier_additional.resize(10, 1); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); +} +#endif // ROCKSDB_LITE + + +TEST_F(DBTest, PutSingleDeleteGet) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo2", "v2")); + ASSERT_EQ("v2", Get(1, "foo2")); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + // Skip FIFO and universal compaction beccause they do not apply to the test + // case. Skip MergePut because single delete does not get removed when it + // encounters a merge. + } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | + kSkipMergePut)); +} + +TEST_F(DBTest, ReadFromPersistedTier) { + do { + Random rnd(301); + Options options = CurrentOptions(); + for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) { + CreateAndReopenWithCF({"pikachu"}, options); + WriteOptions wopt; + wopt.disableWAL = (disableWAL == 1); + // 1st round: put but not flush + ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first")); + ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one")); + ASSERT_EQ("first", Get(1, "foo")); + ASSERT_EQ("one", Get(1, "bar")); + + // Read directly from persited data. + ReadOptions ropt; + ropt.read_tier = kPersistedTier; + std::string value; + if (wopt.disableWAL) { + // as data has not yet being flushed, we expect not found. + ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound()); + ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); + ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); + } + + // Multiget + std::vector multiget_cfs; + multiget_cfs.push_back(handles_[1]); + multiget_cfs.push_back(handles_[1]); + std::vector multiget_keys; + multiget_keys.push_back("foo"); + multiget_keys.push_back("bar"); + std::vector multiget_values; + auto statuses = + db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[0].IsNotFound()); + ASSERT_TRUE(statuses[1].IsNotFound()); + } else { + ASSERT_OK(statuses[0]); + ASSERT_OK(statuses[1]); + } + + // 2nd round: flush and put a new value in memtable. + ASSERT_OK(Flush(1)); + ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello")); + + // once the data has been flushed, we are able to get the + // data when kPersistedTier is used. 
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
+      ASSERT_EQ(value, "first");
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+      ASSERT_EQ(value, "one");
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(
+            db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
+      } else {
+        ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
+        ASSERT_EQ(value, "hello");
+      }
+
+      // Expect same result in multiget
+      multiget_cfs.push_back(handles_[1]);
+      multiget_keys.push_back("rocksdb");
+      statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      ASSERT_TRUE(statuses[0].ok());
+      ASSERT_EQ("first", multiget_values[0]);
+      ASSERT_TRUE(statuses[1].ok());
+      ASSERT_EQ("one", multiget_values[1]);
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[2].IsNotFound());
+      } else {
+        ASSERT_OK(statuses[2]);
+      }
+
+      // 3rd round: delete and flush
+      ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
+      Flush(1);
+      ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
+
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+      if (wopt.disableWAL) {
+        // Still expect finding the value as its delete has not yet been
+        // flushed.
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+        ASSERT_EQ(value, "one");
+      } else {
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+      }
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
+      ASSERT_EQ(value, "hello");
+
+      statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      ASSERT_TRUE(statuses[0].IsNotFound());
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[1].ok());
+        ASSERT_EQ("one", multiget_values[1]);
+      } else {
+        ASSERT_TRUE(statuses[1].IsNotFound());
+      }
+      ASSERT_TRUE(statuses[2].ok());
+      ASSERT_EQ("hello", multiget_values[2]);
+      if (wopt.disableWAL == 0) {
+        DestroyAndReopen(options);
+      }
+    }
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, SingleDeleteFlush) {
+  // Test to check whether flushing preserves a single delete hidden
+  // behind a put.
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Put values on second level (so that they will not be in the same
+    // compaction as the other operations).
+    Put(1, "foo", "first");
+    Put(1, "bar", "one");
+    ASSERT_OK(Flush(1));
+    MoveFilesToLevel(2, 1);
+
+    // (Single) delete hidden by a put
+    SingleDelete(1, "foo");
+    Put(1, "foo", "second");
+    Delete(1, "bar");
+    Put(1, "bar", "two");
+    ASSERT_OK(Flush(1));
+
+    SingleDelete(1, "foo");
+    Delete(1, "bar");
+    ASSERT_OK(Flush(1));
+
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+
+    ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the
+    // test case. Skip MergePut because single delete does not get removed
+    // when it encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBTest, SingleDeletePutFlush) {
+  // Single deletes that encounter the matching put in a flush should get
+  // removed.
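+  // (Background, hedged: a SingleDelete cancels out the single most recent
+  // Put of the key, so the flush can drop both entries entirely, whereas a
+  // regular Delete would have to keep a tombstone.)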
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    Put(1, "foo", Slice());
+    Put(1, "a", Slice());
+    SingleDelete(1, "a");
+    ASSERT_OK(Flush(1));
+
+    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+    // Skip FIFO and universal compaction because they do not apply to the
+    // test case. Skip MergePut because single delete does not get removed
+    // when it encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
+  const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024};  // 4GB value
+  std::string raw(kValueSize, 'v');
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("boo", "v1"));
+  ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
+  ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());
+
+  WriteBatch wb;
+  ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());
+
+  Slice value_slice = raw;
+  Slice key_slice = "foo";
+  SliceParts sp_key(&key_slice, 1);
+  SliceParts sp_value(&value_slice, 1);
+
+  ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_VeryLargeValue) {
+  const size_t kValueSize = 3221225472u;  // 3GB value
+  const size_t kKeySize = 8388608u;       // 8MB key
+  std::string raw(kValueSize, 'v');
+  std::string key1(kKeySize, 'c');
+  std::string key2(kKeySize, 'd');
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("boo", "v1"));
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put(key1, raw));
+  raw[0] = 'w';
+  ASSERT_OK(Put(key2, raw));
+  dbfull()->TEST_WaitForFlushMemTable();
+
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+#endif  // !ROCKSDB_LITE
+
+  std::string value;
+  Status s = db_->Get(ReadOptions(), key1, &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(kValueSize, value.size());
+  ASSERT_EQ('v', value[0]);
+
+  s = db_->Get(ReadOptions(), key2, &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(kValueSize, value.size());
+  ASSERT_EQ('w', value[0]);
+
+  // Compact all files.
+  Flush();
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  // Check DB is not in read-only state.
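+  // (If the flush or compaction above had failed, the background error
+  // would have put the DB into read-only mode and this Put would fail.)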
+ ASSERT_OK(Put("boo", "v1")); + + s = db_->Get(ReadOptions(), key1, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('v', value[0]); + + s = db_->Get(ReadOptions(), key2, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('w', value[0]); +} + +TEST_F(DBTest, GetFromImmutableLayer) { + do { + Options options = CurrentOptions(); + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(0, "foo")); + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); + } while (ChangeOptions()); +} + + +TEST_F(DBTest, GetLevel0Ordering) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + // Check that we process level-0 files in correct order. The code + // below generates two level-0 files where the earlier one comes + // before the later one in the level-0 file list since the earlier + // one has a smaller "smallest" key. + ASSERT_OK(Put(1, "bar", "b")); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, WrongLevel0Config) { + Options options = CurrentOptions(); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + options.level0_stop_writes_trigger = 1; + options.level0_slowdown_writes_trigger = 2; + options.level0_file_num_compaction_trigger = 3; + ASSERT_OK(DB::Open(options, dbname_, &db_)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, GetOrderedByLevels) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + Compact(1, "a", "z"); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_OK(Flush(1)); + ASSERT_EQ("v2", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, GetPicksCorrectFile) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + // Arrange to have multiple files in a non-level-0 level. + ASSERT_OK(Put(1, "a", "va")); + Compact(1, "a", "b"); + ASSERT_OK(Put(1, "x", "vx")); + Compact(1, "x", "y"); + ASSERT_OK(Put(1, "f", "vf")); + Compact(1, "f", "g"); + ASSERT_EQ("va", Get(1, "a")); + ASSERT_EQ("vf", Get(1, "f")); + ASSERT_EQ("vx", Get(1, "x")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, GetEncountersEmptyLevel) { + do { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + // Arrange for the following to happen: + // * sstable A in level 0 + // * nothing in level 1 + // * sstable B in level 2 + // Then do enough Get() calls to arrange for an automatic compaction + // of sstable A. A bug would cause the compaction to be marked as + // occurring at level 1 (instead of the correct level 0). 
+
+    // Step 1: First place sstables in levels 0 and 2
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    ASSERT_OK(Flush(1));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+    // Step 2: clear level 1 if necessary.
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+
+    // Step 3: read a bunch of times
+    for (int i = 0; i < 1000; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+    }
+
+    // Step 4: Wait for compaction to finish
+    dbfull()->TEST_WaitForCompact();
+
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);  // XXX
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, FlushMultipleMemtable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.max_write_buffer_size_to_maintain = -1;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+    ASSERT_OK(Flush(1));
+  } while (ChangeCompactOptions());
+}
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushSchedule) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain =
+      static_cast<int64_t>(options.write_buffer_size);
+  options.max_write_buffer_number = 2;
+  options.write_buffer_size = 120 * 1024;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+
+  std::atomic<int> thread_num(0);
+  // each column family will have 5 threads, each thread generating 2 memtables.
+ // each column family should end up with 10 table files + std::function fill_memtable_func = [&]() { + int a = thread_num.fetch_add(1); + Random rnd(a); + WriteOptions wo; + // this should fill up 2 memtables + for (int k = 0; k < 5000; ++k) { + ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + } + }; + + for (int i = 0; i < 10; ++i) { + threads.emplace_back(fill_memtable_func); + } + + for (auto& t : threads) { + t.join(); + } + + auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); + auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + ASSERT_LE(default_tables, static_cast(10)); + ASSERT_GT(default_tables, static_cast(0)); + ASSERT_LE(pikachu_tables, static_cast(10)); + ASSERT_GT(pikachu_tables, static_cast(0)); +} +#endif // ROCKSDB_LITE + +namespace { +class KeepFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + const char* Name() const override { return "KeepFilter"; } +}; + +class KeepFilterFactory : public CompactionFilterFactory { + public: + explicit KeepFilterFactory(bool check_context = false) + : check_context_(check_context) {} + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (check_context_) { + EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); + EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + } + return std::unique_ptr(new KeepFilter()); + } + + const char* Name() const override { return "KeepFilterFactory"; } + bool check_context_; + std::atomic_bool expect_full_compaction_; + std::atomic_bool expect_manual_compaction_; +}; + +class DelayFilter : public CompactionFilter { + public: + explicit DelayFilter(DBTestBase* d) : db_test(d) {} + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + db_test->env_->addon_time_.fetch_add(1000); + return true; + } + + const char* Name() const override { return "DelayFilter"; } + + private: + DBTestBase* db_test; +}; + +class DelayFilterFactory : public CompactionFilterFactory { + public: + explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new DelayFilter(db_test)); + } + + const char* Name() const override { return "DelayFilterFactory"; } + + private: + DBTestBase* db_test; +}; +} // namespace + +#ifndef ROCKSDB_LITE + +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, FailMoreDbPaths) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 10000000); + options.db_paths.emplace_back(dbname_ + "_2", 1000000); + options.db_paths.emplace_back(dbname_ + "_3", 1000000); + options.db_paths.emplace_back(dbname_ + "_4", 1000000); + options.db_paths.emplace_back(dbname_ + "_5", 1000000); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + +void CheckColumnFamilyMeta( + const ColumnFamilyMetaData& cf_meta, + const std::vector>& files_by_level, + uint64_t start_time, uint64_t end_time) { + ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName); + ASSERT_EQ(cf_meta.levels.size(), files_by_level.size()); + + uint64_t cf_size = 0; + 
size_t file_count = 0; + + for (size_t i = 0; i < cf_meta.levels.size(); ++i) { + const auto& level_meta_from_cf = cf_meta.levels[i]; + const auto& level_meta_from_files = files_by_level[i]; + + ASSERT_EQ(level_meta_from_cf.level, i); + ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size()); + + file_count += level_meta_from_cf.files.size(); + + uint64_t level_size = 0; + for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) { + const auto& file_meta_from_cf = level_meta_from_cf.files[j]; + const auto& file_meta_from_files = level_meta_from_files[j]; + + level_size += file_meta_from_cf.size; + + ASSERT_EQ(file_meta_from_cf.file_number, + file_meta_from_files.fd.GetNumber()); + ASSERT_EQ(file_meta_from_cf.file_number, + TableFileNameToNumber(file_meta_from_cf.name)); + ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size); + ASSERT_EQ(file_meta_from_cf.smallest_seqno, + file_meta_from_files.fd.smallest_seqno); + ASSERT_EQ(file_meta_from_cf.largest_seqno, + file_meta_from_files.fd.largest_seqno); + ASSERT_EQ(file_meta_from_cf.smallestkey, + file_meta_from_files.smallest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.largestkey, + file_meta_from_files.largest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number, + file_meta_from_files.oldest_blob_file_number); + ASSERT_EQ(file_meta_from_cf.oldest_ancester_time, + file_meta_from_files.oldest_ancester_time); + ASSERT_EQ(file_meta_from_cf.file_creation_time, + file_meta_from_files.file_creation_time); + ASSERT_GE(file_meta_from_cf.file_creation_time, start_time); + ASSERT_LE(file_meta_from_cf.file_creation_time, end_time); + ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time); + ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time); + } + + ASSERT_EQ(level_meta_from_cf.size, level_size); + cf_size += level_size; + } + + ASSERT_EQ(cf_meta.file_count, file_count); + ASSERT_EQ(cf_meta.size, cf_size); +} + +void CheckLiveFilesMeta( + const std::vector& live_file_meta, + const std::vector>& files_by_level) { + size_t total_file_count = 0; + for (const auto& f : files_by_level) { + total_file_count += f.size(); + } + + ASSERT_EQ(live_file_meta.size(), total_file_count); + + int level = 0; + int i = 0; + + for (const auto& meta : live_file_meta) { + if (level != meta.level) { + level = meta.level; + i = 0; + } + + ASSERT_LT(i, files_by_level[level].size()); + + const auto& expected_meta = files_by_level[level][i]; + + ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName); + ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber()); + ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name)); + ASSERT_EQ(meta.size, expected_meta.fd.file_size); + ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno); + ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno); + ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString()); + ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString()); + ASSERT_EQ(meta.oldest_blob_file_number, + expected_meta.oldest_blob_file_number); + + ++i; + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, MetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time); + uint64_t start_time = static_cast(temp_time); + + DestroyAndReopen(options); + + Random rnd(301); + int key_index = 0; + for (int i = 0; i < 100; ++i) { + // Add a single blob reference to each file 
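+    // (BlobIndex::EncodeBlob() serializes a reference -- blob file number,
+    // offset, size, compression -- rather than an inline value; the numbers
+    // below are arbitrary test inputs.)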
+ std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000, + /* offset */ 1234, /* size */ 5678, kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index), + blob_index)); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + + ++key_index; + + // Fill up the rest of the file with random values. + GenerateNewFile(&rnd, &key_index, /* nowait */ true); + + Flush(); + } + + std::vector> files_by_level; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level); + + options.env->GetCurrentTime(&temp_time); + uint64_t end_time = static_cast(temp_time); + + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time); + + std::vector live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + CheckLiveFilesMeta(live_file_meta, files_by_level); +} + +namespace { +void MinLevelHelper(DBTest* self, Options& options) { + Random rnd(301); + + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + std::vector values; + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + values.push_back(DBTestBase::RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); + } + self->dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); + } + + // generate one more file in level-0, and should trigger level-0 compaction + std::vector values; + for (int i = 0; i < 12; i++) { + values.push_back(DBTestBase::RandomString(&rnd, 10000)); + ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); + } + self->dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); + ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); +} + +// returns false if the calling-Test should be skipped +bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, + int lev, int strategy) { + fprintf(stderr, + "Test with compression options : window_bits = %d, level = %d, " + "strategy = %d}\n", + wbits, lev, strategy); + options.write_buffer_size = 100 << 10; // 100KB + options.arena_block_size = 4096; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 3; + options.create_if_missing = true; + + if (Snappy_Supported()) { + type = kSnappyCompression; + fprintf(stderr, "using snappy\n"); + } else if (Zlib_Supported()) { + type = kZlibCompression; + fprintf(stderr, "using zlib\n"); + } else if (BZip2_Supported()) { + type = kBZip2Compression; + fprintf(stderr, "using bzip2\n"); + } else if (LZ4_Supported()) { + type = kLZ4Compression; + fprintf(stderr, "using lz4\n"); + } else if (XPRESS_Supported()) { + type = kXpressCompression; + fprintf(stderr, "using xpress\n"); + } else if (ZSTD_Supported()) { + type = kZSTD; + fprintf(stderr, "using ZSTD\n"); + } else { + fprintf(stderr, "skipping test, compression disabled\n"); + return false; + } + options.compression_per_level.resize(options.num_levels); + + // do not compress L0 + for (int i = 0; i < 1; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 1; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + return true; +} +} // namespace + +TEST_F(DBTest, MinLevelToCompress1) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, -14, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); + + // do not compress 
L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(options); + MinLevelHelper(this, options); +} + +TEST_F(DBTest, MinLevelToCompress2) { + Options options = CurrentOptions(); + CompressionType type = kSnappyCompression; + if (!MinLevelToCompress(type, options, 15, -1, 0)) { + return; + } + Reopen(options); + MinLevelHelper(this, options); + + // do not compress L0 and L1 + for (int i = 0; i < 2; i++) { + options.compression_per_level[i] = kNoCompression; + } + for (int i = 2; i < options.num_levels; i++) { + options.compression_per_level[i] = type; + } + DestroyAndReopen(options); + MinLevelHelper(this, options); +} + +// This test may fail because of a legit case that multiple L0 files +// are trivial moved to L1. +TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + CreateAndReopenWithCF({"pikachu"}, options); + + // We must have at most one file per level except for level-0, + // which may have up to kL0_StopWritesTrigger files. + const int kMaxFiles = + options.num_levels + options.level0_stop_writes_trigger; + + Random rnd(301); + std::string value = + RandomString(&rnd, static_cast(2 * options.write_buffer_size)); + for (int i = 0; i < 5 * kMaxFiles; i++) { + ASSERT_OK(Put(1, "key", value)); + ASSERT_LE(TotalTableFiles(1), kMaxFiles); + } + } while (ChangeCompactOptions()); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, SparseMerge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + CreateAndReopenWithCF({"pikachu"}, options); + + FillLevels("A", "Z", 1); + + // Suppose there is: + // small amount of data with prefix A + // large amount of data with prefix B + // small amount of data with prefix C + // and that recent updates have made small changes to all three prefixes. + // Check that we do not do a compaction that merges all of B in one shot. + const std::string value(1000, 'x'); + Put(1, "A", "va"); + // Write approximately 100MB of "B" values + for (int i = 0; i < 100000; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } + Put(1, "C", "vc"); + ASSERT_OK(Flush(1)); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + + // Make sparse update + Put(1, "A", "va2"); + Put(1, "B100", "bvalue2"); + Put(1, "C", "vc2"); + ASSERT_OK(Flush(1)); + + // Compactions should not cause us to create a situation where + // a file overlaps too much data at the next level. 
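+    // (Roughly: with ~1KB values, the 20MB ceiling asserted below would
+    // correspond to one file overlapping about 20,000 "B" keys at the next
+    // level.)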
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); + dbfull()->TEST_CompactRange(1, nullptr, nullptr); + ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), + 20 * 1048576); + } while (ChangeCompactOptions()); +} + +#ifndef ROCKSDB_LITE +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +TEST_F(DBTest, ApproximateSizesMemTable) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + + uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + // Zero if not including mem table + db_->GetApproximateSizes(&r, 1, &size); + ASSERT_EQ(size, 0); + + start = Key(500); + end = Key(600); + r = Range(start, end); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, 0); + + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + } + + start = Key(500); + end = Key(600); + r = Range(start, end); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, 0); + + start = Key(100); + end = Key(1020); + r = Range(start, end); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_GT(size, 6000); + + options.max_write_buffer_number = 8; + options.min_write_buffer_number_to_merge = 5; + options.write_buffer_size = 1024 * N; // Not very large + DestroyAndReopen(options); + default_cf = db_->DefaultColumnFamily(); + + int keys[N * 3]; + for (int i = 0; i < N; i++) { + keys[i * 3] = i * 5; + keys[i * 3 + 1] = i * 5 + 1; + keys[i * 3 + 2] = i * 5 + 2; + } + std::random_shuffle(std::begin(keys), std::end(keys)); + + for (int i = 0; i < N * 3; i++) { + ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024))); + } + + start = Key(100); + end = Key(300); + r = Range(start, end); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, 0); + + start = Key(1050); + end = Key(1080); + r = Range(start, end); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_GT(size, 6000); + + start = Key(2100); + end = Key(2300); + r = Range(start, end); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, 0); + + start = Key(1050); + end = Key(1080); + r = Range(start, end); + uint64_t size_with_mt, size_without_mt; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); + ASSERT_GT(size_with_mt, 6000); + db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_EQ(size_without_mt, 0); + + 
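// The new keys above still live only in the memtable, so their size is + // visible only when include_memtabtles is set (the field name carries the + // upstream spelling). A minimal usage sketch of the API exercised in this + // test, reusing the names defined above: + // + // SizeApproximationOptions opts; + // opts.include_memtabtles = true; + // opts.include_files = true; + // uint64_t approx = 0; + // db_->GetApproximateSizes(opts, default_cf, &r, 1, &approx); + // + // The Flush() below moves the data into SST files, after which the same + // range is accounted under include_files instead. +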
Flush(); + + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024))); + } + + start = Key(1050); + end = Key(1080); + r = Range(start, end); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); + db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_GT(size_with_mt, size_without_mt); + ASSERT_GT(size_without_mt, 6000); + + // Check that include_memtabtles flag works as expected + size_approx_options.include_memtabtles = false; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, size_without_mt); + + // Check that files_size_error_margin works as expected, when the heuristic + // conditions are not met + start = Key(1); + end = Key(1000 + N - 2); + r = Range(start, end); + size_approx_options.files_size_error_margin = -1.0; // disabled + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + uint64_t size2; + size_approx_options.files_size_error_margin = 0.5; // enabled, but not used + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_EQ(size, size2); +} + +TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + Options options = CurrentOptions(); + options.write_buffer_size = 1024 * 1024; + options.compression = kNoCompression; + options.create_if_missing = true; + options.target_file_size_base = 1024 * 1024; + DestroyAndReopen(options); + const auto default_cf = db_->DefaultColumnFamily(); + + const int N = 64000; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files + Flush(); + // Compact the entire key space into the next level + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr); + + // Write more keys + for (int i = N; i < (N + N / 4); i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files again + Flush(); + + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + const std::string start = Key(0); + const std::string end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); +} + +TEST_F(DBTest, GetApproximateMemTableStats) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + + uint64_t count; + uint64_t size; + + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_GT(count, 0); + ASSERT_LE(count, N); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + + start = Key(500); + end = Key(600); + r = 
Range(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_EQ(count, 0); + ASSERT_EQ(size, 0); + + Flush(); + + start = Key(50); + end = Key(60); + r = Range(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_EQ(count, 0); + ASSERT_EQ(size, 0); + + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + } + + start = Key(100); + end = Key(1020); + r = Range(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + ASSERT_GT(count, 20); + ASSERT_GT(size, 6000); +} + +TEST_F(DBTest, ApproximateSizes) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + const int N = 80; + static const int S1 = 100000; + static const int S2 = 105000; // Allow some expansion from metadata + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1))); + } + + // 0 because GetApproximateSizes() does not account for memtable space + ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + for (int compact_start = 0; compact_start < N; compact_start += 10) { + for (int i = 0; i < N; i += 10) { + ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); + ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), + S2 * (i + 1))); + ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); + } + ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); + ASSERT_TRUE( + Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + + std::string cstart_str = Key(compact_start); + std::string cend_str = Key(compact_start + 9); + Slice cstart = cstart_str; + Slice cend = cend_str; + dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); + } + + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. 
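+ // (Size() is backed by ApproximateOffsetOf(), so option configs whose table + // format cannot answer offset queries are skipped in the loop condition + // below.)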
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable | kSkipHashIndex)); +} + +TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { + do { + Options options = CurrentOptions(); + options.compression = kNoCompression; + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + std::string big1 = RandomString(&rnd, 100000); + ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(2), big1)); + ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(4), big1)); + ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); + ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); + + // Check sizes across recovery by reopening a few times + for (int run = 0; run < 3; run++) { + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); + ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); + ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); + ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); + ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); + ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); + ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); + ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); + ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); + + ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + } + // ApproximateOffsetOf() is not yet implemented in plain table format. + } while (ChangeOptions(kSkipPlainTable)); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, Snapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); + Put(0, "foo", "0v1"); + Put(1, "foo", "1v1"); + + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_EQ(1U, GetNumSnapshots()); + uint64_t time_snap1 = GetTimeOldestSnapshots(); + ASSERT_GT(time_snap1, 0U); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + Put(0, "foo", "0v2"); + Put(1, "foo", "1v2"); + + env_->addon_time_.fetch_add(1); + + const Snapshot* s2 = db_->GetSnapshot(); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + Put(0, "foo", "0v3"); + Put(1, "foo", "1v3"); + + { + ManagedSnapshot s3(db_); + ASSERT_EQ(3U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + + Put(0, "foo", "0v4"); + Put(1, "foo", "1v4"); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot())); + ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot())); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } + + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); + ASSERT_EQ("0v1", Get(0, "foo", s1)); + ASSERT_EQ("1v1", Get(1, "foo", s1)); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + + 
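// Releasing s1 should advance both the oldest-snapshot timestamp and the + // oldest-snapshot sequence number to those of s2, as asserted below. +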
db_->ReleaseSnapshot(s1); + ASSERT_EQ("0v2", Get(0, "foo", s2)); + ASSERT_EQ("1v2", Get(1, "foo", s2)); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + ASSERT_EQ(1U, GetNumSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber()); + + db_->ReleaseSnapshot(s2); + ASSERT_EQ(0U, GetNumSnapshots()); + ASSERT_EQ(GetSequenceOldestSnapshots(), 0); + ASSERT_EQ("0v4", Get(0, "foo")); + ASSERT_EQ("1v4", Get(1, "foo")); + } while (ChangeOptions()); +} + +TEST_F(DBTest, HiddenValuesAreRemoved) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + FillLevels("a", "z", 1); + + std::string big = RandomString(&rnd, 50000); + Put(1, "foo", big); + Put(1, "pastfoo", "v"); + const Snapshot* snapshot = db_->GetSnapshot(); + Put(1, "foo", "tiny"); + Put(1, "pastfoo2", "v2"); // Advance sequence number one more + + ASSERT_OK(Flush(1)); + ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); + + ASSERT_EQ(big, Get(1, "foo", snapshot)); + ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); + Slice x("x"); + dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); + dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); + + ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); + // ApproximateOffsetOf() is not yet implemented in plain table format, + // which is used by Size(). + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | + kSkipPlainTable)); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, UnremovableSingleDelete) { + // If we compact: + // + // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2) + // + // We do not want to end up with: + // + // Put(A, v1) Snapshot Put(A, v2) + // + // Because a subsequent SingleDelete(A) would delete the Put(A, v2) + // but not Put(A, v1), so Get(A) would return v1. + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + do { + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + + Put(1, "foo", "first"); + const Snapshot* snapshot = db_->GetSnapshot(); + SingleDelete(1, "foo"); + Put(1, "foo", "second"); + ASSERT_OK(Flush(1)); + + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("second", Get(1, "foo")); + + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); + + SingleDelete(1, "foo"); + + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + + ASSERT_EQ("first", Get(1, "foo", snapshot)); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + db_->ReleaseSnapshot(snapshot); + // Skip FIFO and universal compaction because they do not apply to the test + // case. Skip MergePut because single delete does not get removed when it + // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | + kSkipMergePut)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DeletionMarkers1) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + + // Place a table at level last-1 to prevent merging with preceding mutation + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); + + Delete(1, "foo"); + Put(1, "foo", "v2"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + Slice z("z"); + dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); + // DEL eliminated, but v1 remains because we aren't compacting that level + // (DEL can be eliminated because v2 hides v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); +} + +TEST_F(DBTest, DeletionMarkers2) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "foo", "v1"); + ASSERT_OK(Flush(1)); + const int last = 2; + MoveFilesToLevel(last, 1); + // foo => v1 is now in last level + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + + // Place a table at level last-1 to prevent merging with preceding mutation + Put(1, "a", "begin"); + Put(1, "z", "end"); + Flush(1); + MoveFilesToLevel(last - 1, 1); + ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); + ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); + + Delete(1, "foo"); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + ASSERT_OK(Flush(1)); // Moves to level last-2 + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); + // DEL kept: "last" file overlaps + ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + // Merging last-1 w/ last, so we are the base level for "foo", so + // DEL is removed. (as is v1). + ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); +} + +TEST_F(DBTest, OverlapInLevel0) { + do { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + + // Fill levels 1 and 2 to disable the pushing of new memtables to levels > + // 0. + ASSERT_OK(Put(1, "100", "v100")); + ASSERT_OK(Put(1, "999", "v999")); + Flush(1); + MoveFilesToLevel(2, 1); + ASSERT_OK(Delete(1, "100")); + ASSERT_OK(Delete(1, "999")); + Flush(1); + MoveFilesToLevel(1, 1); + ASSERT_EQ("0,1,1", FilesPerLevel(1)); + + // Make files spanning the following ranges in level-0: + // files[0] 200 .. 900 + // files[1] 300 .. 500 + // Note that files are sorted by smallest key. 
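+ // (The 200 .. 900 file is flushed second but listed first because level-0 + // files are ordered by smallest key rather than by age.)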
+ ASSERT_OK(Put(1, "300", "v300")); + ASSERT_OK(Put(1, "500", "v500")); + Flush(1); + ASSERT_OK(Put(1, "200", "v200")); + ASSERT_OK(Put(1, "600", "v600")); + ASSERT_OK(Put(1, "900", "v900")); + Flush(1); + ASSERT_EQ("2,1,1", FilesPerLevel(1)); + + // Compact away the placeholder files we created initially + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); + ASSERT_EQ("2", FilesPerLevel(1)); + + // Do a memtable compaction. Before bug-fix, the compaction would + // not detect the overlap with level-0 files and would incorrectly place + // the deletion in a deeper level. + ASSERT_OK(Delete(1, "600")); + Flush(1); + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ("NOT_FOUND", Get(1, "600")); + } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, ComparatorCheck) { + class NewComparator : public Comparator { + public: + const char* Name() const override { return "rocksdb.NewComparator"; } + int Compare(const Slice& a, const Slice& b) const override { + return BytewiseComparator()->Compare(a, b); + } + void FindShortestSeparator(std::string* s, const Slice& l) const override { + BytewiseComparator()->FindShortestSeparator(s, l); + } + void FindShortSuccessor(std::string* key) const override { + BytewiseComparator()->FindShortSuccessor(key); + } + }; + Options new_options, options; + NewComparator cmp; + do { + options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + new_options = CurrentOptions(); + new_options.comparator = &cmp; + // only the non-default column family has non-matching comparator + Status s = TryReopenWithColumnFamilies( + {"default", "pikachu"}, std::vector<Options>({options, new_options})); + ASSERT_TRUE(!s.ok()); + ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) + << s.ToString(); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTest, CustomComparator) { + class NumberComparator : public Comparator { + public: + const char* Name() const override { return "test.NumberComparator"; } + int Compare(const Slice& a, const Slice& b) const override { + return ToNumber(a) - ToNumber(b); + } + void FindShortestSeparator(std::string* s, const Slice& l) const override { + ToNumber(*s); // Check format + ToNumber(l); // Check format + } + void FindShortSuccessor(std::string* key) const override { + ToNumber(*key); // Check format + } + + private: + static int ToNumber(const Slice& x) { + // Check that there are no extra characters.
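+ // Keys are expected to look like "[<number>]"; sscanf's %i accepts both + // decimal and hexadecimal input, so "[20]" and "[0x14]" parse to the same + // value.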
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']') + << EscapeString(x); + int val; + char ignored; + EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) + << EscapeString(x); + return val; + } + }; + Options new_options; + NumberComparator cmp; + do { + new_options = CurrentOptions(); + new_options.create_if_missing = true; + new_options.comparator = &cmp; + new_options.write_buffer_size = 4096; // Compact more often + new_options.arena_block_size = 4096; + new_options = CurrentOptions(new_options); + DestroyAndReopen(new_options); + CreateAndReopenWithCF({"pikachu"}, new_options); + ASSERT_OK(Put(1, "[10]", "ten")); + ASSERT_OK(Put(1, "[0x14]", "twenty")); + for (int i = 0; i < 2; i++) { + ASSERT_EQ("ten", Get(1, "[10]")); + ASSERT_EQ("ten", Get(1, "[0xa]")); + ASSERT_EQ("twenty", Get(1, "[20]")); + ASSERT_EQ("twenty", Get(1, "[0x14]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[15]")); + ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]")); + Compact(1, "[0]", "[9999]"); + } + + for (int run = 0; run < 2; run++) { + for (int i = 0; i < 1000; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "[%d]", i * 10); + ASSERT_OK(Put(1, buf, buf)); + } + Compact(1, "[0]", "[1000000]"); + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBTest, DBOpen_Options) { + Options options = CurrentOptions(); + std::string dbname = test::PerThreadDBPath("db_options_test"); + ASSERT_OK(DestroyDB(dbname, options)); + + // Does not exist, and create_if_missing == false: error + DB* db = nullptr; + options.create_if_missing = false; + Status s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does not exist, and create_if_missing == true: OK + options.create_if_missing = true; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; + + // Does exist, and error_if_exists == true: error + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname, &db); + ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); + ASSERT_TRUE(db == nullptr); + + // Does exist, and error_if_exists == false: OK + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + delete db; + db = nullptr; +} + +TEST_F(DBTest, DBOpen_Change_NumLevels) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_TRUE(db_ != nullptr); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "a", "123")); + ASSERT_OK(Put(1, "b", "234")); + Flush(1); + MoveFilesToLevel(3, 1); + Close(); + + options.create_if_missing = false; + options.num_levels = 2; + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); + ASSERT_TRUE(db_ == nullptr); +} + +TEST_F(DBTest, DestroyDBMetaDatabase) { + std::string dbname = test::PerThreadDBPath("db_meta"); + ASSERT_OK(env_->CreateDirIfMissing(dbname)); + std::string metadbname = MetaDatabaseName(dbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metadbname)); + std::string metametadbname = MetaDatabaseName(metadbname, 0); + ASSERT_OK(env_->CreateDirIfMissing(metametadbname)); + + // Destroy previous versions if they exist. Using the long way. 
+ Options options = CurrentOptions(); + ASSERT_OK(DestroyDB(metametadbname, options)); + ASSERT_OK(DestroyDB(metadbname, options)); + ASSERT_OK(DestroyDB(dbname, options)); + + // Setup databases + DB* db = nullptr; + ASSERT_OK(DB::Open(options, dbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(options, metadbname, &db)); + delete db; + db = nullptr; + ASSERT_OK(DB::Open(options, metametadbname, &db)); + delete db; + db = nullptr; + + // Delete databases + ASSERT_OK(DestroyDB(dbname, options)); + + // Check if deletion worked. + options.create_if_missing = false; + ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, SnapshotFiles) { + do { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + + // Write 8MB (80 values, each 100K) + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + std::vector<std::string> values; + for (int i = 0; i < 80; i++) { + values.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put((i < 40), Key(i), values[i])); + } + + // Assert that nothing makes it to disk yet. + ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); + + // get a file snapshot + uint64_t manifest_number = 0; + uint64_t manifest_size = 0; + std::vector<std::string> files; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(files, &manifest_size); + + // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF) + ASSERT_EQ(files.size(), 5U); + + uint64_t number = 0; + FileType type; + + // copy these files to a new snapshot directory + std::string snapdir = dbname_ + ".snapdir/"; + ASSERT_OK(env_->CreateDirIfMissing(snapdir)); + + for (size_t i = 0; i < files.size(); i++) { + // our clients require that GetLiveFiles returns + // files with "/" as first character!
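+ // Each name returned by GetLiveFiles() is relative to the DB directory, + // so the source and destination paths below can be formed by simple + // concatenation.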
+ ASSERT_EQ(files[i][0], '/'); + std::string src = dbname_ + files[i]; + std::string dest = snapdir + files[i]; + + uint64_t size; + ASSERT_OK(env_->GetFileSize(src, &size)); + + // record the number and the size of the + // latest manifest file + if (ParseFileName(files[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > manifest_number) { + manifest_number = number; + ASSERT_GE(size, manifest_size); + size = manifest_size; // copy only valid MANIFEST data + } + } + } + CopyFile(src, dest, size); + } + + // release file snapshot + dbfull()->EnableFileDeletions(/* force */ false); + // overwrite one key, this key should not appear in the snapshot + std::vector<std::string> extras; + for (unsigned int i = 0; i < 1; i++) { + extras.push_back(RandomString(&rnd, 100000)); + ASSERT_OK(Put(0, Key(i), extras[i])); + } + + // verify that data in the snapshot are correct + std::vector<ColumnFamilyDescriptor> column_families; + column_families.emplace_back("default", ColumnFamilyOptions()); + column_families.emplace_back("pikachu", ColumnFamilyOptions()); + std::vector<ColumnFamilyHandle*> cf_handles; + DB* snapdb; + DBOptions opts; + opts.env = env_; + opts.create_if_missing = false; + Status stat = + DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); + ASSERT_OK(stat); + + ReadOptions roptions; + std::string val; + for (unsigned int i = 0; i < 80; i++) { + stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); + ASSERT_EQ(values[i].compare(val), 0); + } + for (auto cfh : cf_handles) { + delete cfh; + } + delete snapdb; + + // look at the new live files after we added an 'extra' key + // and after we took the first snapshot. + uint64_t new_manifest_number = 0; + uint64_t new_manifest_size = 0; + std::vector<std::string> newfiles; + dbfull()->DisableFileDeletions(); + dbfull()->GetLiveFiles(newfiles, &new_manifest_size); + + // find the new manifest file. assert that this manifest file is + // the same one as in the previous snapshot. But its size should be + // larger because we added an extra key after taking the + // previous snapshot.
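+ // (Only the MANIFEST needs the number/size comparison below: SST files + // are immutable once written, so a reused file name implies unchanged + // content.)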
+ for (size_t i = 0; i < newfiles.size(); i++) { + std::string src = dbname_ + "/" + newfiles[i]; + // record the lognumber and the size of the + // latest manifest file + if (ParseFileName(newfiles[i].substr(1), &number, &type)) { + if (type == kDescriptorFile) { + if (number > new_manifest_number) { + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); + } + } + } + } + ASSERT_EQ(manifest_number, new_manifest_number); + ASSERT_GT(new_manifest_size, manifest_size); + + // release file snapshot + dbfull()->EnableFileDeletions(/* force */ false); + } while (ChangeCompactOptions()); +} + +TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) { + do { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + DestroyAndReopen(options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + + uint64_t manifest_size = 0; + std::vector<std::string> files; + dbfull()->GetLiveFiles(files, &manifest_size); + + for (const std::string& f : files) { + uint64_t number = 0; + FileType type; + if (ParseFileName(f.substr(1), &number, &type)) { + if (type == kDescriptorFile) { + uint64_t size_on_disk; + env_->GetFileSize(dbname_ + "/" + f, &size_on_disk); + ASSERT_EQ(manifest_size, size_on_disk); + break; + } + } + } + Close(); + } while (ChangeCompactOptions()); +} +#endif + +TEST_F(DBTest, PurgeInfoLogs) { + Options options = CurrentOptions(); + options.keep_log_file_num = 5; + options.create_if_missing = true; + for (int mode = 0; mode <= 1; mode++) { + if (mode == 1) { + options.db_log_dir = dbname_ + "_logs"; + env_->CreateDirIfMissing(options.db_log_dir); + } else { + options.db_log_dir = ""; + } + for (int i = 0; i < 8; i++) { + Reopen(options); + } + + std::vector<std::string> files; + env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir, + &files); + int info_log_count = 0; + for (std::string file : files) { + if (file.find("LOG") != std::string::npos) { + info_log_count++; + } + } + ASSERT_EQ(5, info_log_count); + + Destroy(options); + // For mode 0 (logs under the DB dir), DestroyDB() should have deleted all + // the info logs. For mode 1, no info log file should have been put under + // the DB dir in the first place. + std::vector<std::string> db_files; + env_->GetChildren(dbname_, &db_files); + for (std::string file : db_files) { + ASSERT_TRUE(file.find("LOG") == std::string::npos); + } + + if (mode == 1) { + // Cleaning up + env_->GetChildren(options.db_log_dir, &files); + for (std::string file : files) { + env_->DeleteFile(options.db_log_dir + "/" + file); + } + env_->DeleteDir(options.db_log_dir); + } + } +} + +#ifndef ROCKSDB_LITE +// Multi-threaded test: +namespace { + +static const int kColumnFamilies = 10; +static const int kNumThreads = 10; +static const int kTestSeconds = 10; +static const int kNumKeys = 1000; + +struct MTState { + DBTest* test; + std::atomic<bool> stop; + std::atomic<int> counter[kNumThreads]; + std::atomic<bool> thread_done[kNumThreads]; +}; + +struct MTThread { + MTState* state; + int id; + bool multiget_batched; +}; + +static void MTThreadBody(void* arg) { + MTThread* t = reinterpret_cast<MTThread*>(arg); + int id = t->id; + DB* db = t->state->test->db_; + int counter = 0; + fprintf(stderr, "...
starting thread %d\n", id); + Random rnd(1000 + id); + char valbuf[1500]; + while (t->state->stop.load(std::memory_order_acquire) == false) { + t->state->counter[id].store(counter, std::memory_order_release); + + int key = rnd.Uniform(kNumKeys); + char keybuf[20]; + snprintf(keybuf, sizeof(keybuf), "%016d", key); + + if (rnd.OneIn(2)) { + // Write values of the form <key, id, counter, cf, unique_id> + // into each of the CFs. + // We add some padding to force compactions. + int unique_id = rnd.Uniform(1000000); + + // Half of the time directly use WriteBatch. Half of the time use + // WriteBatchWithIndex. + if (rnd.OneIn(2)) { + WriteBatch batch; + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast<int>(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), &batch)); + } else { + WriteBatchWithIndex batch(db->GetOptions().comparator); + for (int cf = 0; cf < kColumnFamilies; ++cf) { + snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, + static_cast<int>(counter), cf, unique_id); + batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + } + ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); + } + } else { + // Read a value and verify that it matches the pattern written above + // and that writes to all column families were atomic (unique_id is the + // same) + std::vector<Slice> keys(kColumnFamilies, Slice(keybuf)); + std::vector<std::string> values; + std::vector<Status> statuses; + if (!t->multiget_batched) { + statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys, + &values); + } else { + std::vector<PinnableSlice> pin_values(keys.size()); + statuses.resize(keys.size()); + const Snapshot* snapshot = db->GetSnapshot(); + ReadOptions ro; + ro.snapshot = snapshot; + for (int cf = 0; cf < kColumnFamilies; ++cf) { + db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf], + &pin_values[cf], &statuses[cf]); + } + db->ReleaseSnapshot(snapshot); + values.resize(keys.size()); + for (int cf = 0; cf < kColumnFamilies; ++cf) { + if (statuses[cf].ok()) { + values[cf].assign(pin_values[cf].data(), pin_values[cf].size()); + } + } + } + Status s = statuses[0]; + // all statuses have to be the same + for (size_t i = 1; i < statuses.size(); ++i) { + // they are either both ok or both not-found + ASSERT_TRUE((s.ok() && statuses[i].ok()) || + (s.IsNotFound() && statuses[i].IsNotFound())); + } + if (s.IsNotFound()) { + // Key has not yet been written + } else { + // Check that the writer thread counter is >= the counter in the value + ASSERT_OK(s); + int unique_id = -1; + for (int i = 0; i < kColumnFamilies; ++i) { + int k, w, c, cf, u; + ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c, + &cf, &u)) + << values[i]; + ASSERT_EQ(k, key); + ASSERT_GE(w, 0); + ASSERT_LT(w, kNumThreads); + ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire)); + ASSERT_EQ(cf, i); + if (i == 0) { + unique_id = u; + } else { + // this checks that updates across column families happened + // atomically -- all unique ids are the same + ASSERT_EQ(u, unique_id); + } + } + } + } + counter++; + } + t->state->thread_done[id].store(true, std::memory_order_release); + fprintf(stderr, "...
stopping thread %d after %d ops\n", id, int(counter)); +} + +} // namespace + +class MultiThreadedDBTest + : public DBTest, + public ::testing::WithParamInterface<std::tuple<int, bool>> { + public: + void SetUp() override { + std::tie(option_config_, multiget_batched_) = GetParam(); + } + + static std::vector<int> GenerateOptionConfigs() { + std::vector<int> optionConfigs; + for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) { + optionConfigs.push_back(optionConfig); + } + return optionConfigs; + } + + bool multiget_batched_; +}; + +TEST_P(MultiThreadedDBTest, MultiThreaded) { + if (option_config_ == kPipelinedWrite) return; + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); + std::vector<std::string> cfs; + for (int i = 1; i < kColumnFamilies; ++i) { + cfs.push_back(ToString(i)); + } + Reopen(options); + CreateAndReopenWithCF(cfs, options); + // Initialize state + MTState mt; + mt.test = this; + mt.stop.store(false, std::memory_order_release); + for (int id = 0; id < kNumThreads; id++) { + mt.counter[id].store(0, std::memory_order_release); + mt.thread_done[id].store(false, std::memory_order_release); + } + + // Start threads + MTThread thread[kNumThreads]; + for (int id = 0; id < kNumThreads; id++) { + thread[id].state = &mt; + thread[id].id = id; + thread[id].multiget_batched = multiget_batched_; + env_->StartThread(MTThreadBody, &thread[id]); + } + + // Let them run for a while + env_->SleepForMicroseconds(kTestSeconds * 1000000); + + // Stop the threads and wait for them to finish + mt.stop.store(true, std::memory_order_release); + for (int id = 0; id < kNumThreads; id++) { + while (mt.thread_done[id].load(std::memory_order_acquire) == false) { + env_->SleepForMicroseconds(100000); + } + } +} + +INSTANTIATE_TEST_CASE_P( + MultiThreaded, MultiThreadedDBTest, + ::testing::Combine( + ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()), + ::testing::Bool())); +#endif // ROCKSDB_LITE + +// Group commit test: +#if !defined(TRAVIS) && !defined(OS_WIN) +// Disable this test temporarily on Travis and appveyor as it fails +// intermittently.
Github issue: #4151 +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic<bool> done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast<GCThread*>(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(ToString(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST_F(DBTest, GroupCommitTest) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Reopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WriteThread::JoinBatchGroup:BeganWaiting", + "DBImpl::WriteImpl:BeforeLeaderEnters"}, + {"WriteThread::AwaitState:BlockingWaiting", + "WriteThread::EnterAsBatchGroupLeader:End"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + env_->WaitForJoin(); + + ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); + + std::vector<std::string> expected_db; + for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { + expected_db.push_back(ToString(i)); + } + std::sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + delete itr; + + HistogramData hist_data; + options.statistics->histogramData(DB_WRITE, &hist_data); + ASSERT_GT(hist_data.average, 0.0); + } while (ChangeOptions(kSkipNoSeekToLast)); +} +#endif // !defined(TRAVIS) && !defined(OS_WIN) + +namespace { +typedef std::map<std::string, std::string> KVMap; +} + +class ModelDB : public DB { + public: + class ModelSnapshot : public Snapshot { + public: + KVMap map_; + + SequenceNumber GetSequenceNumber() const override { + // no need to call this + assert(false); + return 0; + } + }; + + explicit ModelDB(const Options& options) : options_(options) {} + using DB::Put; + Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, + const Slice& v) override { + WriteBatch batch; + batch.Put(cf, k, v); + return Write(o, &batch); + } + using DB::Close; + Status Close() override { return Status::OK(); } + using DB::Delete; + Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + batch.Delete(cf, key); + return Write(o, &batch); + } + using DB::SingleDelete; + Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, + const Slice& key) override { + WriteBatch batch; + batch.SingleDelete(cf, key); + return Write(o, &batch); + } + using DB::Merge; + Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, + const Slice& v) override { + WriteBatch batch; + batch.Merge(cf, k, v); + return Write(o, &batch); + } + using DB::Get; + Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/, + const Slice& key, PinnableSlice* /*value*/) override { + return Status::NotSupported(key); + } + + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, + const Slice& key, PinnableSlice* /*slice*/, + GetMergeOperandsOptions*
/*merge_operands_options*/, + int* /*number_of_operands*/) override { + return Status::NotSupported(key); + } + + using DB::MultiGet; + std::vector<Status> MultiGet( + const ReadOptions& /*options*/, + const std::vector<ColumnFamilyHandle*>& /*column_family*/, + const std::vector<Slice>& keys, + std::vector<std::string>* /*values*/) override { + std::vector<Status> s(keys.size(), + Status::NotSupported("Not implemented.")); + return s; + } + +#ifndef ROCKSDB_LITE + using DB::IngestExternalFile; + Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector<std::string>& /*external_files*/, + const IngestExternalFileOptions& /*options*/) override { + return Status::NotSupported("Not implemented."); + } + + using DB::IngestExternalFiles; + Status IngestExternalFiles( + const std::vector<IngestExternalFileArg>& /*args*/) override { + return Status::NotSupported("Not implemented"); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not implemented."); + } + + using DB::VerifyChecksum; + Status VerifyChecksum(const ReadOptions&) override { + return Status::NotSupported("Not implemented."); + } + + using DB::GetPropertiesOfAllTables; + Status GetPropertiesOfAllTables( + ColumnFamilyHandle* /*column_family*/, + TablePropertiesCollection* /*props*/) override { + return Status(); + } + + Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* /*column_family*/, const Range* /*range*/, + std::size_t /*n*/, TablePropertiesCollection* /*props*/) override { + return Status(); + } +#endif // ROCKSDB_LITE + + using DB::KeyMayExist; + bool KeyMayExist(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + std::string* /*value*/, + bool* value_found = nullptr) override { + if (value_found != nullptr) { + *value_found = false; + } + return true; // Not Supported directly + } + using DB::NewIterator; + Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* /*column_family*/) override { + if (options.snapshot == nullptr) { + KVMap* saved = new KVMap; + *saved = map_; + return new ModelIter(saved, true); + } else { + const KVMap* snapshot_state = + &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_); + return new ModelIter(snapshot_state, false); + } + } + Status NewIterators(const ReadOptions& /*options*/, + const std::vector<ColumnFamilyHandle*>& /*column_family*/, + std::vector<Iterator*>* /*iterators*/) override { + return Status::NotSupported("Not supported yet"); + } + const Snapshot* GetSnapshot() override { + ModelSnapshot* snapshot = new ModelSnapshot; + snapshot->map_ = map_; + return snapshot; + } + + void ReleaseSnapshot(const Snapshot* snapshot) override { + delete reinterpret_cast<const ModelSnapshot*>(snapshot); + } + + Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override { + class Handler : public WriteBatch::Handler { + public: + KVMap* map_; + void Put(const Slice& key, const Slice& value) override { + (*map_)[key.ToString()] = value.ToString(); + } + void Merge(const Slice& /*key*/, const Slice& /*value*/) override { + // ignore merge for now + // (*map_)[key.ToString()] = value.ToString(); + } + void Delete(const Slice& key) override { map_->erase(key.ToString()); } + }; + Handler handler; + handler.map_ = &map_; + return batch->Iterate(&handler); + } + + using DB::GetProperty; + bool GetProperty(ColumnFamilyHandle*
/*column_family*/, + const Slice& /*property*/, std::string* /*value*/) override { + return false; + } + using DB::GetIntProperty; + bool GetIntProperty(ColumnFamilyHandle* /*column_family*/, + const Slice& /*property*/, uint64_t* /*value*/) override { + return false; + } + using DB::GetMapProperty; + bool GetMapProperty(ColumnFamilyHandle* /*column_family*/, + const Slice& /*property*/, + std::map<std::string, std::string>* /*value*/) override { + return false; + } + using DB::GetAggregatedIntProperty; + bool GetAggregatedIntProperty(const Slice& /*property*/, + uint64_t* /*value*/) override { + return false; + } + using DB::GetApproximateSizes; + Status GetApproximateSizes(const SizeApproximationOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Range* /*range*/, int n, + uint64_t* sizes) override { + for (int i = 0; i < n; i++) { + sizes[i] = 0; + } + return Status::OK(); + } + using DB::GetApproximateMemTableStats; + void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/, + const Range& /*range*/, + uint64_t* const count, + uint64_t* const size) override { + *count = 0; + *size = 0; + } + using DB::CompactRange; + Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*start*/, const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation."); + } + + Status SetDBOptions( + const std::unordered_map<std::string, std::string>& /*new_options*/) + override { + return Status::NotSupported("Not supported operation."); + } + + using DB::CompactFiles; + Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector<std::string>& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector<std::string>* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation."); + } + + Status PauseBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } + + Status ContinueBackgroundWork() override { + return Status::NotSupported("Not supported operation."); + } + + Status EnableAutoCompaction( + const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/) + override { + return Status::NotSupported("Not supported operation."); + } + + void EnableManualCompaction() override { return; } + + void DisableManualCompaction() override { return; } + + using DB::NumberLevels; + int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; } + + using DB::MaxMemCompactionLevel; + int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override { + return 1; + } + + using DB::Level0StopWriteTrigger; + int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override { + return -1; + } + + const std::string& GetName() const override { return name_; } + + Env* GetEnv() const override { return nullptr; } + + using DB::GetOptions; + Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override { + return options_; + } + + using DB::GetDBOptions; + DBOptions GetDBOptions() const override { return options_; } + + using DB::Flush; + Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + Status ret; + return ret; + } + Status Flush( + const ROCKSDB_NAMESPACE::FlushOptions& /*options*/, + const std::vector<ColumnFamilyHandle*>& /*column_families*/) override { + return Status::OK(); + } + + Status SyncWAL() override { return Status::OK(); } + +#ifndef ROCKSDB_LITE + Status
DisableFileDeletions() override { return Status::OK(); } + + Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } + Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/, + bool /*flush_memtable*/ = true) override { + return Status::OK(); + } + + Status GetSortedWalFiles(VectorLogPtr& /*files*/) override { + return Status::OK(); + } + + Status GetCurrentWalFile( + std::unique_ptr<LogFile>* /*current_log_file*/) override { + return Status::OK(); + } + + virtual Status GetCreationTimeOfOldestFile( + uint64_t* /*creation_time*/) override { + return Status::NotSupported(); + } + + Status DeleteFile(std::string /*name*/) override { return Status::OK(); } + + Status GetUpdatesSince( + ROCKSDB_NAMESPACE::SequenceNumber, + std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*, + const TransactionLogIterator::ReadOptions& /*read_options*/ = + TransactionLogIterator::ReadOptions()) override { + return Status::NotSupported("Not supported in Model DB"); + } + + void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, + ColumnFamilyMetaData* /*metadata*/) override {} +#endif // ROCKSDB_LITE + + Status GetDbIdentity(std::string& /*identity*/) const override { + return Status::OK(); + } + + SequenceNumber GetLatestSequenceNumber() const override { return 0; } + + bool SetPreserveDeletesSequenceNumber(SequenceNumber /*seqnum*/) override { + return true; + } + + ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; } + + private: + class ModelIter : public Iterator { + public: + ModelIter(const KVMap* map, bool owned) + : map_(map), owned_(owned), iter_(map_->end()) {} + ~ModelIter() override { + if (owned_) delete map_; + } + bool Valid() const override { return iter_ != map_->end(); } + void SeekToFirst() override { iter_ = map_->begin(); } + void SeekToLast() override { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + void Seek(const Slice& k) override { + iter_ = map_->lower_bound(k.ToString()); + } + void SeekForPrev(const Slice& k) override { + iter_ = map_->upper_bound(k.ToString()); + Prev(); + } + void Next() override { ++iter_; } + void Prev() override { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + Slice key() const override { return iter_->first; } + Slice value() const override { return iter_->second; } + Status status() const override { return Status::OK(); } + + private: + const KVMap* const map_; + const bool owned_; // Do we own map_ + KVMap::const_iterator iter_; + }; + const Options options_; + KVMap map_; + std::string name_ = ""; +}; + +#ifndef ROCKSDB_VALGRIND_RUN +static std::string RandomKey(Random* rnd, int minimum = 0) { + int len; + do { + len = (rnd->OneIn(3) + ? 1 // Short sometimes to encourage collisions + : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); + } while (len < minimum); + return test::RandomKey(rnd, len); +} + +static bool CompareIterators(int step, DB* model, DB* db, + const Snapshot* model_snap, + const Snapshot* db_snap) { + ReadOptions options; + options.snapshot = model_snap; + Iterator* miter = model->NewIterator(options); + options.snapshot = db_snap; + Iterator* dbiter = db->NewIterator(options); + bool ok = true; + int count = 0; + for (miter->SeekToFirst(), dbiter->SeekToFirst(); + ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) { + count++; + if (miter->key().compare(dbiter->key()) != 0) { + fprintf(stderr, "step %d: Key mismatch: '%s' vs.
'%s'\n", step, + EscapeString(miter->key()).c_str(), + EscapeString(dbiter->key()).c_str()); + ok = false; + break; + } + + if (miter->value().compare(dbiter->value()) != 0) { + fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", + step, EscapeString(miter->key()).c_str(), + EscapeString(miter->value()).c_str(), + EscapeString(miter->value()).c_str()); + ok = false; + } + } + + if (ok) { + if (miter->Valid() != dbiter->Valid()) { + fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", + step, miter->Valid(), dbiter->Valid()); + ok = false; + } + } + delete miter; + delete dbiter; + return ok; +} + +class DBTestRandomized : public DBTest, + public ::testing::WithParamInterface { + public: + void SetUp() override { option_config_ = GetParam(); } + + static std::vector GenerateOptionConfigs() { + std::vector option_configs; + // skip cuckoo hash as it does not support snapshot. + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (!ShouldSkipOptions(option_config, + kSkipDeletesFilterFirst | kSkipNoSeekToLast)) { + option_configs.push_back(option_config); + } + } + option_configs.push_back(kBlockBasedTableWithIndexRestartInterval); + return option_configs; + } +}; + +INSTANTIATE_TEST_CASE_P( + DBTestRandomized, DBTestRandomized, + ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs())); + +TEST_P(DBTestRandomized, Randomized) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); + DestroyAndReopen(options); + + Random rnd(test::RandomSeed() + GetParam()); + ModelDB model(options); + const int N = 10000; + const Snapshot* model_snap = nullptr; + const Snapshot* db_snap = nullptr; + std::string k, v; + for (int step = 0; step < N; step++) { + // TODO(sanjay): Test Get() works + int p = rnd.Uniform(100); + int minimum = 0; + if (option_config_ == kHashSkipList || option_config_ == kHashLinkList || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex || + option_config_ == kBlockBasedTableWithPrefixHashIndex) { + minimum = 1; + } + if (p < 45) { // Put + k = RandomKey(&rnd, minimum); + v = RandomString(&rnd, + rnd.OneIn(20) ? 100 + rnd.Uniform(100) : rnd.Uniform(8)); + ASSERT_OK(model.Put(WriteOptions(), k, v)); + ASSERT_OK(db_->Put(WriteOptions(), k, v)); + } else if (p < 90) { // Delete + k = RandomKey(&rnd, minimum); + ASSERT_OK(model.Delete(WriteOptions(), k)); + ASSERT_OK(db_->Delete(WriteOptions(), k)); + } else { // Multi-element batch + WriteBatch b; + const int num = rnd.Uniform(8); + for (int i = 0; i < num; i++) { + if (i == 0 || !rnd.OneIn(10)) { + k = RandomKey(&rnd, minimum); + } else { + // Periodically re-use the same key from the previous iter, so + // we have multiple entries in the write batch for the same key + } + if (rnd.OneIn(2)) { + v = RandomString(&rnd, rnd.Uniform(10)); + b.Put(k, v); + } else { + b.Delete(k); + } + } + ASSERT_OK(model.Write(WriteOptions(), &b)); + ASSERT_OK(db_->Write(WriteOptions(), &b)); + } + + if ((step % 100) == 0) { + // For DB instances that use the hash index + block-based table, the + // iterator will be invalid right when seeking a non-existent key, right + // than return a key that is close to it. 
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && + option_config_ != kBlockBasedTableWithPrefixHashIndex) { + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + } + + // Save a snapshot from each DB this time that we'll use next + // time we compare things, to make sure the current state is + // preserved with the snapshot + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); + + Reopen(options); + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + + model_snap = model.GetSnapshot(); + db_snap = db_->GetSnapshot(); + } + } + if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); + if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); +} +#endif // ROCKSDB_VALGRIND_RUN + +TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + Reopen(options); + ASSERT_OK(Put("k1", "v1")); + Flush(); + ASSERT_OK(Put("k2", "v2")); + + // Reopen it without prefix extractor, make sure everything still works. + // RocksDB should just fall back to the binary index. + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(); + + Reopen(options); + ASSERT_EQ("v1", Get("k1")); + ASSERT_EQ("v2", Get("k2")); +} + +TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + options.max_open_files = 10; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + // RocksDB sanitizes max open files to at least 20. Modify it back. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast<int*>(arg); + *max_open_files = 11; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + ASSERT_OK(Put("k1", "v1")); + Flush(); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // Force evict tables + dbfull()->TEST_table_cache()->SetCapacity(0); + // Make the table cache keep one entry. + dbfull()->TEST_table_cache()->SetCapacity(1); + + ReadOptions read_options; + read_options.total_order_seek = true; + { + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + iter->Seek("k1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("k1", iter->key().ToString()); + } + + // After total order seek, prefix index should still be used.
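+ // (The tiny table cache configured above forces the table reader to be + // reopened between the two reads; reopening must not lose the prefix index + // needed by the non-total-order seek below.)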
+ read_options.total_order_seek = false;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, ChecksumTest) {
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Flush()); // table with crc checksum
+
+ table_options.checksum = kxxHash;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("e", "f"));
+ ASSERT_OK(Put("g", "h"));
+ ASSERT_OK(Flush()); // table with xxhash checksum
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+
+ table_options.checksum = kxxHash;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
+ for (int iter = 0; iter < 2; ++iter) {
+ // first iteration -- auto compaction
+ // second iteration -- manual compaction
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.max_subcompactions = max_subcompactions_;
+ if (iter == 1) {
+ options.disable_auto_compactions = true;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 110; ++j) {
+ ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980)));
+ }
+ // flush should happen here
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (iter == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ } else {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+ // only 5 files should survive
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ for (int i = 0; i < 50; ++i) {
+ // these keys should have been deleted by the previous compactions
+ ASSERT_EQ("NOT_FOUND", Get(ToString(i)));
+ }
+ }
+}
+
+TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20KB
+ options.arena_block_size = 4096;
+ options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 6;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
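+ // (60 flushed files, merged 6-at-a-time by intra-L0 compaction because
+ // level0_file_num_compaction_trigger = 6, leave roughly 60 / 6 = 10.)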
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // It should be compacted to no more than 20 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+}
+
+TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20KB
+ options.arena_block_size = 4096;
+ options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 3;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500)));
+ ASSERT_OK(Put("key" + ToString(i), ""));
+ ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500)));
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("", Get("key" + ToString(i)));
+ }
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500)));
+ ASSERT_OK(Delete("key" + ToString(i)));
+ ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500)));
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i)));
+ }
+}
+
+// Check that FIFO-with-TTL works with max_open_files != -1.
+TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ // TTL is now supported with max_open_files != -1.
+ options.max_open_files = 100;
+ options = CurrentOptions(options);
+ ASSERT_OK(TryReopen(options));
+
+ options.max_open_files = -1;
+ ASSERT_OK(TryReopen(options));
+}
+
+// Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
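+// (Presumably only the block-based format records the creation-time table
+// property that FIFO TTL relies on; the other factories are expected to
+// fail TryReopen with Status::NotSupported, as asserted below.)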
+TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ options = CurrentOptions(options);
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ASSERT_OK(TryReopen(options));
+
+ Destroy(options);
+ options.table_factory.reset(NewPlainTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ Destroy(options);
+ options.table_factory.reset(NewAdaptiveTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, FIFOCompactionWithTTLTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ env_->time_elapse_only_sleep_ = false;
+ options.env = env_;
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // manual compaction.
+ {
+ env_->addon_time_.store(0);
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ // Note: Couldn't use SleepForMicroseconds because it takes an int instead
+ // of uint64_t. Hence used addon_time_ directly.
+ // env_->SleepForMicroseconds(2 * 60 * 60 * 1000 * 1000);
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+
+ // Since no flushes and compactions have run, the db should still be in
+ // the same state even after considerable time has passed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ }
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // automatic compaction.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->addon_time_.fetch_add(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Create 1 more file to trigger TTL compaction. The old files are dropped.
+ for (int i = 0; i < 1; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980)));
+ }
+ Flush();
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Only the 1 new file remains.
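+ // (The flush above schedules compaction; FIFO sees that the 10 old files
+ // have outlived options.ttl and drops them all in one pass.)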
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_LE(SizeAtLevel(0), + options.compaction_options_fifo.max_table_files_size); + } + + // Test that shows the fall back to size-based FIFO compaction if TTL-based + // deletion doesn't move the total size to be less than max_table_files_size. + { + options.write_buffer_size = 10 << 10; // 10KB + options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB + options.compaction_options_fifo.allow_compaction = false; + options.ttl = 1 * 60 * 60; // 1 hour + options = CurrentOptions(options); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 3; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + + // Sleep for 2 hours -- which is much greater than TTL. + env_->addon_time_.fetch_add(2 * 60 * 60); + // Just to make sure that we are in the same state even after sleeping. + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + + for (int i = 0; i < 5; i++) { + for (int j = 0; j < 140; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + // Size limit is still guaranteed. + ASSERT_LE(SizeAtLevel(0), + options.compaction_options_fifo.max_table_files_size); + } + + // Test with TTL + Intra-L0 compactions. + { + options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB + options.compaction_options_fifo.allow_compaction = true; + options.ttl = 1 * 60 * 60; // 1 hour + options.level0_file_num_compaction_trigger = 6; + options = CurrentOptions(options); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1 + // (due to level0_file_num_compaction_trigger = 6). + // So total files = 1 + remaining 4 = 5. + ASSERT_EQ(NumTableFilesAtLevel(0), 5); + + // Sleep for 2 hours -- which is much greater than TTL. + env_->addon_time_.fetch_add(2 * 60 * 60); + // Just to make sure that we are in the same state even after sleeping. + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 5); + + // Create 10 more files. The old 5 files are dropped as their ttl expired. + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 5); + ASSERT_LE(SizeAtLevel(0), + options.compaction_options_fifo.max_table_files_size); + } + + // Test with large TTL + Intra-L0 compactions. + // Files dropped based on size, as ttl doesn't kick in. + { + options.write_buffer_size = 20 << 10; // 20K + options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB + options.compaction_options_fifo.allow_compaction = true; + options.ttl = 1 * 60 * 60; // 1 hour + options.level0_file_num_compaction_trigger = 6; + options = CurrentOptions(options); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 60; i++) { + // Generate and flush a file about 20KB. 
+ for (int j = 0; j < 20; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + // It should be compacted to 10 files. + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + for (int i = 0; i < 60; i++) { + // Generate and flush a file about 20KB. + for (int j = 0; j < 20; j++) { + ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980))); + } + Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + // It should be compacted to no more than 20 files. + ASSERT_GT(NumTableFilesAtLevel(0), 10); + ASSERT_LT(NumTableFilesAtLevel(0), 18); + // Size limit is still guaranteed. + ASSERT_LE(SizeAtLevel(0), + options.compaction_options_fifo.max_table_files_size); + } +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +/* + * This test is not reliable enough as it heavily depends on disk behavior. + * Disable as it is flaky. + */ +TEST_F(DBTest, DISABLED_RateLimitingTest) { + Options options = CurrentOptions(); + options.write_buffer_size = 1 << 20; // 1MB + options.level0_file_num_compaction_trigger = 2; + options.target_file_size_base = 1 << 20; // 1MB + options.max_bytes_for_level_base = 4 << 20; // 4MB + options.max_bytes_for_level_multiplier = 4; + options.compression = kNoCompression; + options.create_if_missing = true; + options.env = env_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.IncreaseParallelism(4); + DestroyAndReopen(options); + + WriteOptions wo; + wo.disableWAL = true; + + // # no rate limiting + Random rnd(301); + uint64_t start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + } + uint64_t elapsed = env_->NowMicros() - start; + double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed; + uint64_t rate_limiter_drains = + TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS); + ASSERT_EQ(0, rate_limiter_drains); + Close(); + + // # rate limiting with 0.7 x threshold + options.rate_limiter.reset( + NewGenericRateLimiter(static_cast(0.7 * raw_rate))); + env_->bytes_written_ = 0; + DestroyAndReopen(options); + + start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + } + rate_limiter_drains = + TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) - + rate_limiter_drains; + elapsed = env_->NowMicros() - start; + Close(); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); + // Most intervals should've been drained (interval time is 100ms, elapsed is + // micros) + ASSERT_GT(rate_limiter_drains, 0); + ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1); + double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; + fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio); + ASSERT_TRUE(ratio < 0.8); + + // # rate limiting with half of the raw_rate + options.rate_limiter.reset( + NewGenericRateLimiter(static_cast(raw_rate / 2))); + env_->bytes_written_ = 0; + DestroyAndReopen(options); + + start = env_->NowMicros(); + // Write ~96M data + for (int64_t i = 0; i < (96 << 10); ++i) { + ASSERT_OK( + Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + } + elapsed = env_->NowMicros() - start; + rate_limiter_drains = + TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) - + rate_limiter_drains; + Close(); + 
ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+ ASSERT_LT(ratio, 0.6);
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+ options.table_factory.reset(new PlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ Destroy(options);
+ ASSERT_TRUE(!TryReopen(options).IsNotSupported());
+
+ // Test the check of prefix_extractor when the hash index is used for a
+ // block-based table
+ BlockBasedTableOptions to;
+ to.index_type = BlockBasedTableOptions::kHashSearch;
+ options = CurrentOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(to));
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBTest, ConcurrentMemtableNotSupported) {
+ Options options = CurrentOptions();
+ options.allow_concurrent_memtable_write = true;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+
+ DestroyDB(dbname_, options);
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ASSERT_NOK(TryReopen(options));
+
+ options.memtable_factory.reset(new SkipListFactory);
+ ASSERT_OK(TryReopen(options));
+
+ ColumnFamilyOptions cf_options(options);
+ cf_options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ColumnFamilyHandle* handle;
+ ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, SanitizeNumThreads) {
+ for (int attempt = 0; attempt < 2; attempt++) {
+ const size_t kTotalTasks = 8;
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+ Options options = CurrentOptions();
+ if (attempt == 0) {
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 2;
+ }
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ // Insert 4 tasks into the low-priority queue and 4 tasks into the
+ // high-priority queue
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i],
+ (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+ }
+
+ // Wait up to 10s for them to be scheduled.
+ for (int i = 0; i < 10000; i++) {
+ if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
+ options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+
+ // pool size 3, total tasks 4. Queue size should be 1.
+ ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+ // pool size 2, total tasks 4. Queue size should be 2.
+ ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } + + ASSERT_OK(Put("abc", "def")); + ASSERT_EQ("def", Get("abc")); + Flush(); + ASSERT_EQ("def", Get("abc")); + } +} + +TEST_F(DBTest, WriteSingleThreadEntry) { + std::vector threads; + dbfull()->TEST_LockMutex(); + auto w = dbfull()->TEST_BeginWrite(); + threads.emplace_back([&] { Put("a", "b"); }); + env_->SleepForMicroseconds(10000); + threads.emplace_back([&] { Flush(); }); + env_->SleepForMicroseconds(10000); + dbfull()->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->TEST_EndWrite(w); + dbfull()->TEST_UnlockMutex(); + + for (auto& t : threads) { + t.join(); + } +} + +TEST_F(DBTest, ConcurrentFlushWAL) { + const size_t cnt = 100; + Options options; + WriteOptions wopt; + ReadOptions ropt; + for (bool two_write_queues : {false, true}) { + for (bool manual_wal_flush : {false, true}) { + options.two_write_queues = two_write_queues; + options.manual_wal_flush = manual_wal_flush; + options.create_if_missing = true; + DestroyAndReopen(options); + std::vector threads; + threads.emplace_back([&] { + for (size_t i = 0; i < cnt; i++) { + auto istr = ToString(i); + db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, "b" + istr); + } + }); + if (two_write_queues) { + threads.emplace_back([&] { + for (size_t i = cnt; i < 2 * cnt; i++) { + auto istr = ToString(i); + WriteBatch batch; + batch.Put("a" + istr, "b" + istr); + dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true); + } + }); + } + threads.emplace_back([&] { + for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put + db_->FlushWAL(false); + } + }); + for (auto& t : threads) { + t.join(); + } + options.create_if_missing = false; + // Recover from the wal and make sure that it is not corrupted + Reopen(options); + for (size_t i = 0; i < cnt; i++) { + PinnableSlice pval; + auto istr = ToString(i); + ASSERT_OK( + db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval)); + ASSERT_TRUE(pval == ("b" + istr)); + } + } + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, DynamicMemtableOptions) { + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k5KB = 5 * 1024; + Options options; + options.env = env_; + options.create_if_missing = true; + options.compression = kNoCompression; + options.max_background_compactions = 1; + options.write_buffer_size = k64KB; + options.arena_block_size = 16 * 1024; + options.max_write_buffer_number = 2; + // Don't trigger compact/slowdown/stop + options.level0_file_num_compaction_trigger = 1024; + options.level0_slowdown_writes_trigger = 1024; + options.level0_stop_writes_trigger = 1024; + DestroyAndReopen(options); + + auto gen_l0_kb = [this](int size) { + const int kNumPutsBeforeWaitForFlush = 64; + Random rnd(301); + for (int i = 0; i < size; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + + // The following condition prevents a race condition between flush jobs + // acquiring work and this thread filling up multiple memtables. Without + // this, the flush might produce less files than expected because + // multiple memtables are flushed into a single L0 file. This race + // condition affects assertion (A). 
+ if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) { + dbfull()->TEST_WaitForFlushMemTable(); + } + } + dbfull()->TEST_WaitForFlushMemTable(); + }; + + // Test write_buffer_size + gen_l0_kb(64); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_LT(SizeAtLevel(0), k64KB + k5KB); + ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2); + + // Clean up L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Increase buffer size + ASSERT_OK(dbfull()->SetOptions({ + {"write_buffer_size", "131072"}, + })); + + // The existing memtable inflated 64KB->128KB when we invoked SetOptions(). + // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data. + gen_l0_kb(192); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A) + ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB); + ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB); + + // Decrease buffer size below current usage + ASSERT_OK(dbfull()->SetOptions({ + {"write_buffer_size", "65536"}, + })); + // The existing memtable became eligible for flush when we reduced its + // capacity to 64KB. Two keys need to be added to trigger flush: first causes + // memtable to be marked full, second schedules the flush. Then we should have + // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key. + gen_l0_kb(2); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); + ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB); + + // Test max_write_buffer_number + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + // Start from scratch and disable compaction/flush. Flush can only happen + // during compaction but trigger is pretty high + options.disable_auto_compactions = true; + DestroyAndReopen(options); + env_->SetBackgroundThreads(0, Env::HIGH); + + // Put until writes are stopped, bounded by 256 puts. We should see stop at + // ~128KB + int count = 0; + Random rnd(301); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:Wait", + [&](void* /*arg*/) { sleeping_task_low.WakeUp(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + while (!sleeping_task_low.WokenUp() && count < 256) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; + } + ASSERT_GT(static_cast(count), 128 * 0.8); + ASSERT_LT(static_cast(count), 128 * 1.2); + + sleeping_task_low.WaitUntilDone(); + + // Increase + ASSERT_OK(dbfull()->SetOptions({ + {"max_write_buffer_number", "8"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + count = 0; + while (!sleeping_task_low.WokenUp() && count < 1024) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + count++; + } +// Windows fails this test. 
Will tune in the future and figure out
+// an appropriate number
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ // Decrease
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "4"},
+ }));
+ // Clean up memtable and L0
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out
+// an appropriate number
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 266 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // ROCKSDB_LITE
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+ int expected_count) {
+ int op_count = 0;
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ if (thread.operation_type == op_type) {
+ op_count++;
+ }
+ }
+ ASSERT_EQ(op_count, expected_count);
+}
+} // namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ TryReopen(options);
+
+ std::vector<ThreadStatus> thread_list;
+ Status s = env_->GetThreadList(&thread_list);
+
+ for (int i = 0; i < 2; ++i) {
+ // repeat the test with different numbers of high / low priority threads
+ const int kTestCount = 3;
+ const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+ const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+ const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
+ for (int test = 0; test < kTestCount; ++test) {
+ // Change the number of threads in high / low priority pool.
+ env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+ env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
+ // Wait to ensure that all threads have been registered
+ unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+ // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
+ // all threads have been registered.
+ // Try up to 60 seconds.
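+ // (60000 attempts x SleepForMicroseconds(1000) adds up to 60 seconds.)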
+ for (int num_try = 0; num_try < 60000; num_try++) {
+ env_->SleepForMicroseconds(1000);
+ thread_list.clear();
+ s = env_->GetThreadList(&thread_list);
+ ASSERT_OK(s);
+ memset(thread_type_counts, 0, sizeof(thread_type_counts));
+ for (auto thread : thread_list) {
+ ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+ thread_type_counts[thread.thread_type]++;
+ }
+ if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
+ kHighPriCounts[test] &&
+ thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
+ kLowPriCounts[test] &&
+ thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
+ kBottomPriCounts[test]) {
+ break;
+ }
+ }
+ // Verify the number of high-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+ kHighPriCounts[test]);
+ // Verify the number of low-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
+ kLowPriCounts[test]);
+ // Verify the number of bottom-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
+ kBottomPriCounts[test]);
+ }
+ if (i == 0) {
+ // repeat the test with multiple column families
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ }
+ }
+ db_->DropColumnFamily(handles_[2]);
+ delete handles_[2];
+ handles_.erase(handles_.begin() + 2);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ Close();
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+}
+
+TEST_F(DBTest, DisableThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = false;
+ TryReopen(options);
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+ // Verify that none of the column family info exists
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.enable_thread_tracking = true;
+ options = CurrentOptions(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+ {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ uint64_t num_running_flushes = 0;
+ db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes);
+ ASSERT_EQ(num_running_flushes, 0);
+
+ Put(1, "k1", std::string(100000, 'x')); // Fill memtable
+ Put(1, "k2", std::string(100000, 'y')); // Trigger flush
+
+ // The first sync point is to make sure there's one flush job
+ // running when we perform VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+ db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes);
+ ASSERT_EQ(num_running_flushes, 1);
+ // This second sync point is to ensure the flush job will not
+ // complete until we have performed VerifyOperationCount().
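+ // (The LoadDependency call above made "DBTest::ThreadStatusFlush:2" a
+ // predecessor of "FlushJob::WriteLevel0Table", so the flush job cannot
+ // write its L0 table until this marker is reached.)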
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + const int kNumL0Files = 4; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_subcompactions = max_subcompactions_; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"}, + {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"}, + {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"}, + }); + for (int tests = 0; tests < 2; ++tests) { + DestroyAndReopen(options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + // The Put Phase. + for (int file = 0; file < kNumL0Files; ++file) { + for (int key = 0; key < kEntriesPerBuffer; ++key) { + ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer), + RandomString(&rnd, kTestValueSize))); + } + Flush(); + } + // This makes sure a compaction won't be scheduled until + // we have done with the above Put Phase. + uint64_t num_running_compactions = 0; + db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions); + ASSERT_EQ(num_running_compactions, 0); + TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0"); + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); + + // This makes sure at least one compaction is running. + TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1"); + + if (options.enable_thread_tracking) { + // expecting one single L0 to L1 compaction + VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1); + } else { + // If thread tracking is not enabled, compaction count should be 0. + VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0); + } + db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions); + ASSERT_EQ(num_running_compactions, 1); + // TODO(yhchiang): adding assert to verify each compaction stage. + TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2"); + + // repeat the test with disabling thread tracking. 
+ options.enable_thread_tracking = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(DBTestWithParam, PreShutdownManualCompaction) { + Options options = CurrentOptions(); + options.max_subcompactions = max_subcompactions_; + CreateAndReopenWithCF({"pikachu"}, options); + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("1,1,2", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f"); + ASSERT_EQ("0,0,2", FilesPerLevel(1)); + + // Compact all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + CancelAllBackgroundWork(db_); + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_EQ("1,0,2", FilesPerLevel(1)); + + if (iter == 0) { + options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + +TEST_F(DBTest, PreShutdownFlush) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "key", "value")); + CancelAllBackgroundWork(db_); + Status s = + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_TRUE(s.IsShutdownInProgress()); +} + +TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 40; + const int kNumL0Files = 4; + + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.max_subcompactions = max_subcompactions_; + + TryReopen(options); + Random rnd(301); + + std::vector thread_list; + // Delay both flush and compaction + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownMultipleCompaction:Preshutdown"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"}, + {"DBTest::PreShutdownMultipleCompaction:Preshutdown", + "CompactionJob::Run():End"}, + {"CompactionJob::Run():End", + "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}}); + + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + } + + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + + // Speed up the test + if (operation_count[ThreadStatus::OP_FLUSH] > 1 && + operation_count[ThreadStatus::OP_COMPACTION] > + 0.6 * options.max_background_compactions) { + break; + } + if (file == 15 * kNumL0Files) { + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + } + } + + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown"); + ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); + CancelAllBackgroundWork(db_); + TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); + dbfull()->TEST_WaitForCompact(); + // Record the number of compactions at a time. + for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { + operation_count[i] = 0; + } + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); +} + +TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 40; + const int kNumL0Files = 4; + + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.max_subcompactions = max_subcompactions_; + + TryReopen(options); + Random rnd(301); + + std::vector thread_list; + // Delay both flush and compaction + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBTest::PreShutdownCompactionMiddle:Preshutdown", + "CompactionJob::Run():Inprogress"}, + {"CompactionJob::Run():Start", + "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"}, + {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"}, + {"CompactionJob::Run():End", + "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Make rocksdb busy + int key = 0; + // check how many threads are doing compaction using GetThreadList + int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; + for (int file = 0; file < 16 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, 
kTestValueSize))); + } + + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + + // Speed up the test + if (operation_count[ThreadStatus::OP_FLUSH] > 1 && + operation_count[ThreadStatus::OP_COMPACTION] > + 0.6 * options.max_background_compactions) { + break; + } + if (file == 15 * kNumL0Files) { + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction"); + } + } + + ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); + CancelAllBackgroundWork(db_); + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); + TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); + dbfull()->TEST_WaitForCompact(); + // Record the number of compactions at a time. + for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { + operation_count[i] = 0; + } + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + operation_count[thread.operation_type]++; + } + ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0); +} + +#endif // ROCKSDB_USING_THREAD_STATUS + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, FlushOnDestroy) { + WriteOptions wo; + wo.disableWAL = true; + ASSERT_OK(Put("foo", "v1", wo)); + CancelAllBackgroundWork(db_); +} + +TEST_F(DBTest, DynamicLevelCompressionPerLevel) { + if (!Snappy_Supported()) { + return; + } + const int kNKeys = 120; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + std::random_shuffle(std::begin(keys), std::end(keys)); + + Random rnd(301); + Options options; + options.create_if_missing = true; + options.db_write_buffer_size = 20480; + options.write_buffer_size = 20480; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.target_file_size_base = 20480; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 102400; + options.max_bytes_for_level_multiplier = 4; + options.max_background_compactions = 1; + options.num_levels = 5; + + options.compression_per_level.resize(3); + options.compression_per_level[0] = kNoCompression; + options.compression_per_level[1] = kNoCompression; + options.compression_per_level[2] = kSnappyCompression; + + OnFileDeletionListener* listener = new OnFileDeletionListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + + // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should + // be compressed, so total data size should be more than 80K. + for (int i = 0; i < 20; i++) { + ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(3), 0); + // Assuming each files' metadata is at least 50 bytes/ + ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4); + + // Insert 400KB. 
Some data will be compressed + for (int i = 21; i < 120; i++) { + ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), + 120U * 4000U + 50U * 24); + // Make sure data in files in L3 is not compacted by removing all files + // in L4 and calculate number of rows + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + })); + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + for (auto file : cf_meta.levels[4].files) { + listener->SetExpectedFileName(dbname_ + file.name); + ASSERT_OK(dbfull()->DeleteFile(file.name)); + } + listener->VerifyMatchedCount(cf_meta.levels[4].files.size()); + + int num_keys = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + ASSERT_OK(iter->status()); + ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U); +} + +TEST_F(DBTest, DynamicLevelCompressionPerLevel2) { + if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) { + return; + } + const int kNKeys = 500; + int keys[kNKeys]; + for (int i = 0; i < kNKeys; i++) { + keys[i] = i; + } + std::random_shuffle(std::begin(keys), std::end(keys)); + + Random rnd(301); + Options options; + options.create_if_missing = true; + options.db_write_buffer_size = 6000000; + options.write_buffer_size = 600000; + options.max_write_buffer_number = 2; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 2; + options.soft_pending_compaction_bytes_limit = 1024 * 1024; + options.target_file_size_base = 20; + + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = 200; + options.max_bytes_for_level_multiplier = 8; + options.max_background_compactions = 1; + options.num_levels = 5; + std::shared_ptr mtf(new mock::MockTableFactory); + options.table_factory = mtf; + + options.compression_per_level.resize(3); + options.compression_per_level[0] = kNoCompression; + options.compression_per_level[1] = kLZ4Compression; + options.compression_per_level[2] = kZlibCompression; + + DestroyAndReopen(options); + // When base level is L4, L4 is LZ4. 
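+ // (A sketch of the expected mapping, assuming dynamic level bytes with
+ // num_levels = 5: compression_per_level[0] applies to L0, [1] to the
+ // current base level L4, so flushes stay uncompressed and L4 output uses
+ // LZ4; once the base level moves up to L3, [2] = Zlib applies to L4.)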
+ std::atomic num_zlib(0); + std::atomic num_lz4(0); + std::atomic num_no(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + if (compaction->output_level() == 4) { + ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); + num_lz4.fetch_add(1); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { + auto* compression = reinterpret_cast(arg); + ASSERT_TRUE(*compression == kNoCompression); + num_no.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < 100; i++) { + std::string value = RandomString(&rnd, 200); + ASSERT_OK(Put(Key(keys[i]), value)); + if (i % 25 == 24) { + Flush(); + dbfull()->TEST_WaitForCompact(); + } + } + + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_EQ(NumTableFilesAtLevel(3), 0); + ASSERT_GT(NumTableFilesAtLevel(4), 0); + ASSERT_GT(num_no.load(), 2); + ASSERT_GT(num_lz4.load(), 0); + int prev_num_files_l4 = NumTableFilesAtLevel(4); + + // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib + num_lz4.store(0); + num_no.store(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + if (compaction->output_level() == 4 && compaction->start_level() == 3) { + ASSERT_TRUE(compaction->output_compression() == kZlibCompression); + num_zlib.fetch_add(1); + } else { + ASSERT_TRUE(compaction->output_compression() == kLZ4Compression); + num_lz4.fetch_add(1); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) { + auto* compression = reinterpret_cast(arg); + ASSERT_TRUE(*compression == kNoCompression); + num_no.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 101; i < 500; i++) { + std::string value = RandomString(&rnd, 200); + ASSERT_OK(Put(Key(keys[i]), value)); + if (i % 100 == 99) { + Flush(); + dbfull()->TEST_WaitForCompact(); + } + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_GT(NumTableFilesAtLevel(3), 0); + ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4); + ASSERT_GT(num_no.load(), 2); + ASSERT_GT(num_lz4.load(), 0); + ASSERT_GT(num_zlib.load(), 0); +} + +TEST_F(DBTest, DynamicCompactionOptions) { + // minimum write buffer size is enforced at 64KB + const uint64_t k32KB = 1 << 15; + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k1MB = 1 << 20; + const uint64_t k4KB = 1 << 12; + Options options; + options.env = env_; + options.create_if_missing = true; + options.compression = kNoCompression; + options.soft_pending_compaction_bytes_limit = 1024 * 1024; + options.write_buffer_size = k64KB; + options.arena_block_size = 4 * k4KB; + options.max_write_buffer_number = 2; + // Compaction related options + 
options.level0_file_num_compaction_trigger = 3; + options.level0_slowdown_writes_trigger = 4; + options.level0_stop_writes_trigger = 8; + options.target_file_size_base = k64KB; + options.max_compaction_bytes = options.target_file_size_base * 10; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = k128KB; + options.max_bytes_for_level_multiplier = 4; + + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + DestroyAndReopen(options); + + auto gen_l0_kb = [this](int start, int size, int stride) { + Random rnd(301); + for (int i = 0; i < size; i++) { + ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); + } + dbfull()->TEST_WaitForFlushMemTable(); + }; + + // Write 3 files that have the same key range. + // Since level0_file_num_compaction_trigger is 3, compaction should be + // triggered. The compaction should result in one L1 file + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel()); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1U, metadata.size()); + ASSERT_LE(metadata[0].size, k64KB + k4KB); + ASSERT_GE(metadata[0].size, k64KB - k4KB); + + // Test compaction trigger and target_file_size_base + // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. + // Writing to 64KB L0 files should trigger a compaction. Since these + // 2 L0 files have the same key range, compaction merge them and should + // result in 2 32KB L1 files. + ASSERT_OK(dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}, + {"target_file_size_base", ToString(k32KB)}})); + + gen_l0_kb(0, 64, 1); + ASSERT_EQ("1,1", FilesPerLevel()); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,2", FilesPerLevel()); + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2U, metadata.size()); + ASSERT_LE(metadata[0].size, k32KB + k4KB); + ASSERT_GE(metadata[0].size, k32KB - k4KB); + ASSERT_LE(metadata[1].size, k32KB + k4KB); + ASSERT_GE(metadata[1].size, k32KB - k4KB); + + // Test max_bytes_for_level_base + // Increase level base size to 256KB and write enough data that will + // fill L1 and L2. L1 size should be around 256KB while L2 size should be + // around 256KB x 4. + ASSERT_OK( + dbfull()->SetOptions({{"max_bytes_for_level_base", ToString(k1MB)}})); + + // writing 96 x 64KB => 6 * 1024KB + // (L1 + L2) = (1 + 4) * 1024KB + for (int i = 0; i < 96; ++i) { + gen_l0_kb(i, 64, 96); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_GT(SizeAtLevel(1), k1MB / 2); + ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); + + // Within (0.5, 1.5) of 4MB. + ASSERT_GT(SizeAtLevel(2), 2 * k1MB); + ASSERT_LT(SizeAtLevel(2), 6 * k1MB); + + // Test max_bytes_for_level_multiplier and + // max_bytes_for_level_base. Now, reduce both mulitplier and level base, + // After filling enough data that can fit in L1 - L3, we should see L1 size + // reduces to 128KB from 256KB which was asserted previously. Same for L2. 
+ ASSERT_OK( + dbfull()->SetOptions({{"max_bytes_for_level_multiplier", "2"}, + {"max_bytes_for_level_base", ToString(k128KB)}})); + + // writing 20 x 64KB = 10 x 128KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB + for (int i = 0; i < 20; ++i) { + gen_l0_kb(i, 64, 32); + } + dbfull()->TEST_WaitForCompact(); + uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); + ASSERT_TRUE(total_size < k128KB * 7 * 1.5); + + // Test level0_stop_writes_trigger. + // Clean up memtable and L0. Block compaction threads. If continue to write + // and flush memtables. We should see put stop after 8 memtable flushes + // since level0_stop_writes_trigger = 8 + dbfull()->TEST_FlushMemTable(true, true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // Block compaction + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + int count = 0; + Random rnd(301); + WriteOptions wo; + while (count < 64) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); + dbfull()->TEST_FlushMemTable(true, true); + count++; + if (dbfull()->TEST_write_controler().IsStopped()) { + sleeping_task_low.WakeUp(); + break; + } + } + // Stop trigger = 8 + ASSERT_EQ(count, 8); + // Unblock + sleeping_task_low.WaitUntilDone(); + + // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. + // Block compaction thread again. Perform the put and memtable flushes + // until we see the stop after 6 memtable flushes. + ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}})); + dbfull()->TEST_FlushMemTable(true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Block compaction again + sleeping_task_low.Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + count = 0; + while (count < 64) { + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); + dbfull()->TEST_FlushMemTable(true, true); + count++; + if (dbfull()->TEST_write_controler().IsStopped()) { + sleeping_task_low.WakeUp(); + break; + } + } + ASSERT_EQ(count, 6); + // Unblock + sleeping_task_low.WaitUntilDone(); + + // Test disable_auto_compactions + // Compaction thread is unblocked but auto compaction is disabled. Write + // 4 L0 files and compaction should be triggered. If auto compaction is + // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of + // L0 files do not change after the call. + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}})); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't stop + dbfull()->TEST_FlushMemTable(true); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 4); + + // Enable auto compaction and perform the same test, # of L0 files should be + // reduced after compaction. 
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+    // Wait for compaction so that put won't stop
+    dbfull()->TEST_FlushMemTable(true);
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_LT(NumTableFilesAtLevel(0), 4);
+}
+
+// Test dynamic FIFO compaction options.
+// This test covers just option parsing and makes sure that the options are
+// correctly assigned. Also look at DBOptionsTest.SetFIFOCompactionOptions,
+// which makes sure that the FIFO compaction functionality works as expected
+// when the options are changed dynamically.
+// Even more FIFOCompactionTests are at DBTest.FIFOCompaction* .
+TEST_F(DBTest, DynamicFIFOCompactionOptions) {
+  Options options;
+  options.ttl = 0;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Initial defaults
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024 * 1024 * 1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{max_table_files_size=23;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 97);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{max_table_files_size=31;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            31);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{max_table_files_size=51;allow_compaction=true;}"}}));
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            51);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 49);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+}
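The test above leans on RocksDB's nested option syntax: struct-valued options such as compaction_options_fifo are set from one string of semicolon-separated key=value pairs inside braces, and fields omitted from the string keep their current values (exactly what the interleaved ttl updates verify). A small sketch of the same call outside the test harness, with illustrative values:

    #include <cassert>
    #include "rocksdb/db.h"

    // Shrink the FIFO size budget and enable intra-FIFO compaction at runtime.
    // Only the two named fields change; other FIFO fields are left untouched.
    void TuneFifo(rocksdb::DB* db) {
      rocksdb::Status s = db->SetOptions(
          {{"compaction_options_fifo",
            "{max_table_files_size=1048576;allow_compaction=true;}"}});
      assert(s.ok());
    }
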
+
+TEST_F(DBTest, DynamicUniversalCompactionOptions) {
+  Options options;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Initial defaults
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio,
+            1U);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_universal", "{size_ratio=7;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_universal", "{min_merge_width=11;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            11u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.target_file_size_base = 200000;
+  options.max_bytes_for_level_base = 1000000;
+  options.max_bytes_for_level_multiplier = 2;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kCDTKeysPerBuffer = 4;
+  const int kTestSize = kCDTKeysPerBuffer * 4096;
+  const int kTotalIteration = 100;
+  // the second half of the test involves random failures of file creation.
+  const int kRandomFailureTest = kTotalIteration / 2;
+  std::vector<std::string> values;
+  for (int i = 0; i < kTestSize; ++i) {
+    values.push_back("NOT_FOUND");
+  }
+  for (int j = 0; j < kTotalIteration; ++j) {
+    if (j == kRandomFailureTest) {
+      env_->non_writeable_rate_.store(90);
+    }
+    for (int k = 0; k < kTestSize; ++k) {
+      // here we expect some of the Puts to fail.
+      std::string value = RandomString(&rnd, 100);
+      Status s = Put(Key(k), Slice(value));
+      if (s.ok()) {
+        // update the latest successful put
+        values[k] = value;
+      }
+      // But everything before the failure simulation starts should succeed.
+      if (j < kRandomFailureTest) {
+        ASSERT_OK(s);
+      }
+    }
+  }
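The non_writeable_rate_ knob used above belongs to the test harness's SpecialEnv. The same effect can be sketched with the public EnvWrapper: intercept file creation and fail a configurable fraction of calls. Everything below is illustrative only, not part of the upstream test or API:

    #include <atomic>
    #include <cstdlib>
    #include "rocksdb/env.h"

    // Illustrative-only Env that fails a percentage of writable-file creations.
    class FlakyEnv : public rocksdb::EnvWrapper {
     public:
      explicit FlakyEnv(rocksdb::Env* base) : rocksdb::EnvWrapper(base) {}
      rocksdb::Status NewWritableFile(const std::string& fname,
                                      std::unique_ptr<rocksdb::WritableFile>* r,
                                      const rocksdb::EnvOptions& opts) override {
        if (std::rand() % 100 < fail_rate_percent_.load()) {
          return rocksdb::Status::IOError("injected failure creating " + fname);
        }
        return target()->NewWritableFile(fname, r, opts);
      }
      std::atomic<int> fail_rate_percent_{0};  // 0 = never fail, 90 = mostly fail
    };
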
+
+  // If rocksdb does not do the correct job, an internal assert will fail here.
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  // verify we have the latest successful update
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+
+  // reopen and re-verify we have the latest successful update
+  env_->non_writeable_rate_.store(0);
+  Reopen(options);
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, DynamicMiscOptions) {
+  // Test max_sequential_skip_in_iterations
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 16;
+  options.compression = kNoCompression;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+    int key0 = key_start;
+    int key1 = key_start + 1;
+    int key2 = key_start + 2;
+    Random rnd(301);
+    ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8)));
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8)));
+    }
+    ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8)));
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(key1));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+    ASSERT_EQ(num_reseek,
+              TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+  };
+  // No reseek
+  assert_reseek_count(100, 0);
+
+  ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
+  // Clear memtable and make new option effective
+  dbfull()->TEST_FlushMemTable(true);
+  // Trigger reseek
+  assert_reseek_count(200, 1);
+
+  ASSERT_OK(
+      dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
+  // Clear memtable and make new option effective
+  dbfull()->TEST_FlushMemTable(true);
+  // No reseek (the ticker is cumulative, so it stays at 1)
+  assert_reseek_count(300, 1);
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  // Test soft_pending_compaction_bytes_limit,
+  // hard_pending_compaction_bytes_limit
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
+                    {"hard_pending_compaction_bytes_limit", "300"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
+  ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
+  // Test report_bg_io_stats
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
+  // sanity check
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+  // Test compression
+  // sanity check
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
+
+  if (Snappy_Supported()) {
+    ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+    ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+                                                       &mutable_cf_options));
+    ASSERT_EQ(CompressionType::kSnappyCompression,
+              mutable_cf_options.compression);
+  }
+
+  // Test paranoid_file_checks already done in db_block_cache_test
+  ASSERT_OK(dbfull()->SetOptions(handles_[1],
+                                 {{"paranoid_file_checks", "true"}}));
{{"paranoid_file_checks", "true"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_TRUE(mutable_cf_options.report_bg_io_stats); +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, L0L1L2AndUpHitCounter) { + Options options = CurrentOptions(); + options.write_buffer_size = 32 * 1024; + options.target_file_size_base = 32 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 64 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"mypikachu"}, options); + + int numkeys = 20000; + for (int i = 0; i < numkeys; i++) { + ASSERT_OK(Put(1, Key(i), "val")); + } + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + for (int i = 0; i < numkeys; i++) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + + ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); +} + +TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { + // iter 0 -- zlib + // iter 1 -- bzip2 + // iter 2 -- lz4 + // iter 3 -- lz4HC + // iter 4 -- xpress + CompressionType compressions[] = {kZlibCompression, kBZip2Compression, + kLZ4Compression, kLZ4HCCompression, + kXpressCompression}; + for (auto comp : compressions) { + if (!CompressionTypeSupported(comp)) { + continue; + } + // first_table_version 1 -- generate with table_version == 1, read with + // table_version == 2 + // first_table_version 2 -- generate with table_version == 2, read with + // table_version == 1 + for (int first_table_version = 1; first_table_version <= 2; + ++first_table_version) { + BlockBasedTableOptions table_options; + table_options.format_version = first_table_version; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.compression = comp; + DestroyAndReopen(options); + + int kNumKeysWritten = 1000; + + Random rnd(301); + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + } + + table_options.format_version = first_table_version == 1 ? 
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      Reopen(options);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        auto r = Get(Key(i));
+        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+      }
+    }
+  }
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_write_buffer_number = 16;
+
+  // Block background threads
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(dbname_ + "/" + filenames[i]);
+  }
+  env_->DeleteDir(dbname_);
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to level 2
+  // After that, (100K, 200K)
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+  }
+
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  Close();
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Unblock background threads
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  Destroy(options);
+}
+
+class DelayedMergeOperator : public MergeOperator {
+ private:
+  DBTest* db_test_;
+
+ public:
+  explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+
+  bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                   MergeOperationOutput* merge_out) const override {
+    db_test_->env_->addon_time_.fetch_add(1000);
+    merge_out->new_value = "";
+    return true;
+  }
+
+  const char* Name() const override { return "DelayedMergeOperator"; }
+};
+
+TEST_F(DBTest, MergeTestTime) {
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  // Enable time profiling
+  SetPerfLevel(kEnableTime);
+  this->env_->addon_time_.store(0);
+  this->env_->time_elapse_only_sleep_ = true;
+  this->env_->no_slowdown_ = true;
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  DestroyAndReopen(options);
+
+  ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+  db_->Put(WriteOptions(), "foo", one);
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+  ASSERT_OK(Flush());
+
+  ReadOptions opt;
+  opt.verify_checksums = true;
+  opt.snapshot = nullptr;
+  std::string result;
+  db_->Get(opt, "foo", &result);
+
+  ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+
+  ReadOptions read_options;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+
+  ASSERT_EQ(1, count);
+  ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  this->env_->time_elapse_only_sleep_ = false;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+  SetPerfLevel(kEnableTime);
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  options.compaction_style = kCompactionStyleUniversal;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 1000; i++) {
+    ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+    ASSERT_OK(Flush());
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+}
+
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<DelayFilterFactory>(this);
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(kExceptTimeForMutex);
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+  }
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0);
+  delete itr;
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, TestLogCleanup) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 64 * 1024;  // very small
+  // only two memtables allowed ==> only two log files
+  options.max_write_buffer_number = 2;
+  Reopen(options);
+
+  for (int i = 0; i < 100000; ++i) {
+    Put(Key(i), "val");
+    // only 2 memtables will be alive, so logs_to_free needs to always be
+    // below 2
+    ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, EmptyCompactedDB) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_TRUE(s.IsNotSupported());
+  Close();
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SuggestCompactRangeTest) {
+  class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+   public:
+    std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& context) override {
+      saved_context = context;
+      std::unique_ptr<CompactionFilter> empty_filter;
+      return empty_filter;
+    }
+    const char* Name() const override {
+      return "CompactionFilterFactoryGetContext";
+    }
+    static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+      return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+                 compaction_filter_factory)
+          ->saved_context.is_manual_compaction;
+    }
+    CompactionFilter::Context saved_context;
+  };
+
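CompactionFilterFactoryGetContext above only records CompactionFilter::Context; a factory can also act on it, for instance to attach a filter only to manual compactions. A minimal self-contained sketch (the filter shown keeps every key and exists only for illustration; neither class is part of the upstream test):

    #include <memory>
    #include "rocksdb/compaction_filter.h"

    class KeepEverythingFilter : public rocksdb::CompactionFilter {
     public:
      bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                  const rocksdb::Slice& /*value*/, std::string* /*new_value*/,
                  bool* /*value_changed*/) const override {
        return false;  // returning false keeps the key/value pair
      }
      const char* Name() const override { return "KeepEverythingFilter"; }
    };

    class ManualOnlyFilterFactory : public rocksdb::CompactionFilterFactory {
     public:
      std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
          const rocksdb::CompactionFilter::Context& context) override {
        if (context.is_manual_compaction) {
          return std::unique_ptr<rocksdb::CompactionFilter>(
              new KeepEverythingFilter());
        }
        return nullptr;  // automatic compactions run without a filter
      }
      const char* Name() const override { return "ManualOnlyFilterFactory"; }
    };
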
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+  options.compaction_style = kCompactionStyleLevel;
+  options.compaction_filter_factory.reset(
+      new CompactionFilterFactoryGetContext());
+  options.write_buffer_size = 200 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_compaction_bytes = static_cast<uint64_t>(1) << 60;  // inf
+
+  Reopen(options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < 3; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4", FilesPerLevel(0));
+  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("2,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("3,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("2,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("3,4,4", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4,8", FilesPerLevel(0));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+  // compact it three times
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+    dbfull()->TEST_WaitForCompact();
+  }
+
+  // All files are compacted
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // non-overlapping with the file on level 0
+  Slice start("a"), end("b");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  dbfull()->TEST_WaitForCompact();
+
+  // should not compact the level 0 file
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  start = Slice("j");
+  end = Slice("m");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
+
+  // now it should compact the level 0 file
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, PromoteL0) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
+
+  // non-overlapping ranges
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {81, 160}, {0, 80}, {161, 240}, {241, 320}};
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (const auto& range : ranges) {
+    for (int32_t j = range.first; j < range.second; j++) {
+      values[j] = RandomString(&rnd, value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
+
+  // Promote L0 level to L2.
+  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+  // We expect that all the files were trivially moved from L0 to L2
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+
+  for (const auto& kv : values) {
+    ASSERT_EQ(Get(Key(kv.first)), kv.second);
+  }
+}
+
+TEST_F(DBTest, PromoteL0Failure) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
+
+  // Produce two L0 files with overlapping ranges.
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(Put(Key(3), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), ""));
+  ASSERT_OK(Flush());
+
+  Status status;
+  // Fails because L0 has overlapping files.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Now there is a file in L1.
+  ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+
+  ASSERT_OK(Put(Key(5), ""));
+  ASSERT_OK(Flush());
+  // Fails because L1 is non-empty.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
+  const int kNumLevels = 2;
+  const int kNumL0Files = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024)));
+    Flush();
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
+  const int kNumL0Files = 50;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  // never slowdown / stop
+  options.level0_slowdown_writes_trigger = 999999;
+  options.level0_stop_writes_trigger = 999999;
+  options.max_background_compactions = 10;
+  DestroyAndReopen(options);
+
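The next test, like several below, uses RocksDB's internal SyncPoint facility: LoadDependency takes {predecessor, successor} pairs, and any thread reaching a successor point blocks until some thread has passed the matching predecessor point. A condensed sketch with hypothetical point names (this is a test-only internal utility, not public API):

    #include "test_util/sync_point.h"  // internal test-only utility

    void OrderBackgroundWork() {
      // A thread at "Reader::Start" waits until another passes "Writer::Done".
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
          {{"Writer::Done", "Reader::Start"}});  // hypothetical point names
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
      // Points are marked in code with TEST_SYNC_POINT("Writer::Done"); etc.
    }
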
+  // Schedule automatic compactions after the manual one starts, but before
+  // it finishes, to ensure a conflict.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction:Start",
+        "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
+       {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
+        "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+  std::atomic<int> callback_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
+      [&](void* /*arg*/) { callback_count.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    // put two keys to ensure no trivial move
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  port::Thread manual_compaction_thread([this]() {
+    CompactRangeOptions croptions;
+    croptions.exclusive_manual_compaction = true;
+    ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // put two keys to ensure no trivial move
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
+
+  ASSERT_GE(callback_count.load(), 1);
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_NE("NOT_FOUND", Get(Key(i)));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  manual_compaction_thread.join();
+  dbfull()->TEST_WaitForCompact();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
+  Options options = CurrentOptions();
+  options.max_background_compactions = 1;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 36;
+  options.level0_stop_writes_trigger = 36;
+  DestroyAndReopen(options);
+
+  // generate files for manual compaction
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    // put two keys to ensure no trivial move
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+
+  std::vector<std::string> input_files;
+  input_files.push_back(cf_meta_data.levels[0].files[0].name);
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0",
+       "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
+      {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
+       "CompactFilesImpl:1"},
+  });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread manual_compaction_thread([&]() {
+    auto s = db_->CompactFiles(CompactionOptions(),
+                               db_->DefaultColumnFamily(), input_files, 0);
+  });
+
+  TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
+  // generate enough files to trigger compaction
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+  ASSERT_GT(cf_meta_data.levels[0].files.size(),
+            options.level0_file_num_compaction_trigger);
+  TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:End");
+
+  manual_compaction_thread.join();
+  dbfull()->TEST_WaitForCompact();
+
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+  ASSERT_LE(cf_meta_data.levels[0].files.size(),
+            options.level0_file_num_compaction_trigger);
+}
+#endif  // ROCKSDB_LITE
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int64_t j = 0;
+  for (int i = 0; i < 5; i++) {
+    for (int pass = 1; pass <= 3; pass++) {
+      WriteBatch batch;
+      size_t write_size = 1024 * 1024 * (5 + i);
+      fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
+              (write_size / 1024 / 1024), pass);
+      for (;;) {
+        std::string data(3000, j++ % 127 + 20);
+        data += ToString(j);
+        batch.Put(handles_[0], Slice(data), Slice(data));
+        if (batch.GetDataSize() > write_size) {
+          break;
+        }
+      }
+      fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
+              (batch.GetDataSize() / 1024 / 1024));
+      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+      fprintf(stderr, "done\n");
+    }
+  }
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+  // iter == 0 -- leveled
+  // iter == 1 -- leveled, but throw in a flush between two levels compacting
+  // iter == 2 -- universal
+  for (int iter = 0; iter < 3; ++iter) {
+    Options options = CurrentOptions();
+    if (iter < 2) {
+      options.compaction_style = kCompactionStyleLevel;
+    } else {
+      options.compaction_style = kCompactionStyleUniversal;
+    }
+    options.write_buffer_size = 110 << 10;
+    options.level0_file_num_compaction_trigger = 4;
+    options.num_levels = 4;
+    options.compression = kNoCompression;
+    options.max_bytes_for_level_base = 450 << 10;
+    options.target_file_size_base = 98 << 10;
+    options.max_write_buffer_number = 2;
+
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int num = 0; num < 14; num++) {
+      GenerateNewRandomFile(&rnd);
+    }
+
+    if (iter == 1) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::RunManualCompaction()::1",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "DBImpl::RunManualCompaction()::2"}});
+    } else {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"CompactionJob::Run():Start",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "CompactionJob::Run():End"}});
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    std::vector<port::Thread> threads;
+    threads.emplace_back([&]() { Compact("a", "z"); });
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // This has to start a flush. If flushes are blocked, this will try to
+    // create 3 memtables, and that will fail because max_write_buffer_number
+    // is 2.
+    for (int num = 0; num < 3; num++) {
+      GenerateNewRandomFile(&rnd, /* nowait */ true);
+    }
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+    for (auto& t : threads) {
+      t.join();
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+  const int kEntriesPerMemTable = 100;
+  const int kTotalFlushes = 12;
+
+  Options options = CurrentOptions();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  options.env = env_;
+  env_->no_slowdown_ = true;
+  options.write_buffer_size = 100000000;
+  options.max_write_buffer_number = 256;
+  options.max_background_compactions = 1;
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000000;  // Start with 20MB/s
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(kEntriesPerMemTable));
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Block compactions
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  for (int i = 0; i < 3; i++) {
+    Put(Key(i), std::string(10000, 'x'));
+    Flush();
+  }
+
+  // These writes will be slowed down to 1KB/s
+  uint64_t estimated_sleep_time = 0;
+  Random rnd(301);
+  Put("", "");
+  uint64_t cur_rate = options.delayed_write_rate;
+  for (int i = 0; i < kTotalFlushes; i++) {
+    uint64_t size_memtable = 0;
+    for (int j = 0; j < kEntriesPerMemTable; j++) {
+      auto rand_num = rnd.Uniform(20);
+      // Spread the size range wider.
+      size_t entry_size = rand_num * rand_num * rand_num;
+      WriteOptions wo;
+      Put(Key(i), std::string(entry_size, 'x'), wo);
+      size_memtable += entry_size + 18;
+      // Occasionally sleep a while
+      if (rnd.Uniform(20) == 6) {
+        env_->SleepForMicroseconds(2666);
+      }
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    estimated_sleep_time += size_memtable * 1000000u / cur_rate;
+    // The rate is slowed down twice per iteration: once when the memtable
+    // switches and once when the flush finishes.
+    cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
+                                     kIncSlowdownRatio * kIncSlowdownRatio);
+  }
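The estimate kept in estimated_sleep_time is plain arithmetic: writing S bytes at a delayed rate of R bytes/second costs S/R seconds, and the loop decays the rate twice per flush by kIncSlowdownRatio. A standalone sketch of the same computation (the 0.8 ratio is an assumption for illustration; the test itself uses the library's kIncSlowdownRatio constant):

    #include <cstdint>

    // Microseconds expected to be spent stalled while writing `flushes`
    // memtables of `bytes_each` bytes, with the write rate decaying twice per
    // flush (memtable switch + flush completion).
    uint64_t EstimateStallMicros(uint64_t bytes_each, int flushes,
                                 uint64_t start_rate_bytes_per_sec) {
      const double kRatio = 0.8;  // assumed decay ratio, illustration only
      uint64_t total_micros = 0;
      double rate = static_cast<double>(start_rate_bytes_per_sec);
      for (int i = 0; i < flushes; i++) {
        total_micros += static_cast<uint64_t>(bytes_each * 1000000.0 / rate);
        rate = rate * kRatio * kRatio;
      }
      return total_micros;
    }
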
+  // Check that the total sleep time falls into a rough range around the
+  // estimate.
+  ASSERT_GT(env_->addon_time_.load(),
+            static_cast<int64_t>(estimated_sleep_time / 2));
+  ASSERT_LT(env_->addon_time_.load(),
+            static_cast<int64_t>(estimated_sleep_time * 2));
+
+  env_->no_slowdown_ = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, HardLimit) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  env_->SetBackgroundThreads(1, Env::LOW);
+  options.max_write_buffer_number = 256;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 * 1024;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 999999;
+  options.level0_stop_writes_trigger = 999999;
+  options.hard_pending_compaction_bytes_limit = 800 << 10;
+  options.max_bytes_for_level_base = 10000000000u;
+  options.max_background_compactions = 1;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  std::atomic<int> callback_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+        callback_count.fetch_add(1);
+        sleeping_task_low.WakeUp();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  int key_idx = 0;
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+    dbfull()->TEST_WaitForFlushMemTable();
+  }
+
+  ASSERT_EQ(0, callback_count.load());
+
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+    dbfull()->TEST_WaitForFlushMemTable();
+  }
+  ASSERT_GE(callback_count.load(), 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WaitUntilDone();
+}
+
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+class WriteStallListener : public EventListener {
+ public:
+  WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
+  void OnStallConditionsChanged(const WriteStallInfo& info) override {
+    MutexLock l(&mutex_);
+    condition_ = info.condition.cur;
+  }
+  bool CheckCondition(WriteStallCondition expected) {
+    MutexLock l(&mutex_);
+    return expected == condition_;
+  }
+
+ private:
+  port::Mutex mutex_;
+  WriteStallCondition condition_;
+};
+
+TEST_F(DBTest, SoftLimit) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.max_write_buffer_number = 256;
+  options.level0_file_num_compaction_trigger = 1;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000;  // About 20KB/s limited rate
+  options.soft_pending_compaction_bytes_limit = 160000;
+  options.target_file_size_base = 99999999;  // All into one file
+  options.max_bytes_for_level_base = 50000;
+  options.max_bytes_for_level_multiplier = 10;
+  options.max_background_compactions = 1;
+  options.compression = kNoCompression;
+  WriteStallListener* listener = new WriteStallListener();
+  options.listeners.emplace_back(listener);
+
+  // FlushMemtable with opt.wait=true does not wait for
+  // `OnStallConditionsChanged` being called. The event listener is triggered
+  // on `JobContext::Clean`, which happens after the flush result is
+  // installed.
+  // We use a sync point to create a custom WaitForFlush that waits for the
+  // context cleanup.
+  port::Mutex flush_mutex;
+  port::CondVar flush_cv(&flush_mutex);
+  bool flush_finished = false;
+  auto InstallFlushCallback = [&]() {
+    {
+      MutexLock l(&flush_mutex);
+      flush_finished = false;
+    }
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
+          {
+            MutexLock l(&flush_mutex);
+            flush_finished = true;
+          }
+          flush_cv.SignalAll();
+        });
+  };
+  auto WaitForFlush = [&]() {
+    {
+      MutexLock l(&flush_mutex);
+      while (!flush_finished) {
+        flush_cv.Wait();
+      }
+    }
+    SyncPoint::GetInstance()->ClearCallBack(
+        "DBImpl::BackgroundCallFlush:ContextCleanedUp");
+  };
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  // Generating 360KB in Level 3
+  for (int i = 0; i < 72; i++) {
+    Put(Key(i), std::string(5000, 'x'));
+    if (i % 10 == 0) {
+      dbfull()->TEST_FlushMemTable(true, true);
+    }
+  }
+  dbfull()->TEST_WaitForCompact();
+  MoveFilesToLevel(3);
+
+  // Generating 360KB in Level 2
+  for (int i = 0; i < 72; i++) {
+    Put(Key(i), std::string(5000, 'x'));
+    if (i % 10 == 0) {
+      dbfull()->TEST_FlushMemTable(true, true);
+    }
+  }
+  dbfull()->TEST_WaitForCompact();
+  MoveFilesToLevel(2);
+
+  Put(Key(0), "");
+
+  test::SleepingBackgroundTask sleeping_task_low;
+  // Block compactions
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Create 3 L0 files, making the score of L0 be 3.
+  for (int i = 0; i < 3; i++) {
+    Put(Key(i), std::string(5000, 'x'));
+    Put(Key(100 - i), std::string(5000, 'x'));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    dbfull()->TEST_FlushMemTable(true, true);
+    WaitForFlush();
+  }
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  sleeping_task_low.Reset();
+  dbfull()->TEST_WaitForCompact();
+
+  // Now there is one L1 file, but it doesn't trigger soft_rate_limit.
+  // The L1 file size is around 30KB.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Only allow one compaction to go through.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+        // Schedule a sleeping task.
+        sleeping_task_low.Reset();
+        env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                       &sleeping_task_low, Env::Priority::LOW);
+      });
+
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+  // Create 3 L0 files, making the score of L0 be 3.
+  for (int i = 0; i < 3; i++) {
+    Put(Key(10 + i), std::string(5000, 'x'));
+    Put(Key(90 - i), std::string(5000, 'x'));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    dbfull()->TEST_FlushMemTable(true, true);
+    WaitForFlush();
+  }
+
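The InstallFlushCallback/WaitForFlush pair above is an instance of a generic pattern: park a test thread on a condition variable until a sync-point callback fires. Stripped to its essentials (the point name is hypothetical; headers are the internal test utilities this file already uses):

    #include "port/port.h"
    #include "test_util/sync_point.h"
    #include "util/mutexlock.h"

    void BlockUntilPointReached() {
      ROCKSDB_NAMESPACE::port::Mutex mu;
      ROCKSDB_NAMESPACE::port::CondVar cv(&mu);
      bool fired = false;
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
          "Some::Point", [&](void* /*arg*/) {  // hypothetical point name
            ROCKSDB_NAMESPACE::MutexLock l(&mu);
            fired = true;
            cv.SignalAll();
          });
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
      // ... kick off the background work that passes Some::Point ...
      {
        ROCKSDB_NAMESPACE::MutexLock l(&mu);
        while (!fired) {
          cv.Wait();
        }
      }
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack("Some::Point");
    }
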
+  // Wake up the sleeping task so compaction can run, and wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 60KB) which exceeds the 50KB base by
+  // 10KB. Given level multiplier 10, the estimated pending compaction is
+  // around 100KB, which doesn't trigger soft_pending_compaction_bytes_limit.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Create 3 L0 files, making the score of L0 be 3, higher than the L1 score.
+  for (int i = 0; i < 3; i++) {
+    Put(Key(20 + i), std::string(5000, 'x'));
+    Put(Key(80 - i), std::string(5000, 'x'));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    dbfull()->TEST_FlushMemTable(true, true);
+    WaitForFlush();
+  }
+  // Wake up the sleeping task so compaction can run, and wait for it to go
+  // back to sleep to make sure exactly one compaction goes through.
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 90KB) which exceeds the 50KB base by
+  // 40KB. The L2 size is 360KB, so with an estimated level fanout of 4, the
+  // estimated pending compaction is around 200KB, triggering
+  // soft_pending_compaction_bytes_limit.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Shrink the level base so L2 will hit the soft limit more easily.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_bytes_for_level_base", "5000"},
+  }));
+
+  Put("", "");
+  Flush();
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WaitUntilSleeping();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, LastWriteBufferDelay) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  options.max_write_buffer_number = 4;
+  options.delayed_write_rate = 20000;
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  int kNumKeysPerMemtable = 3;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(kNumKeysPerMemtable));
+
+  Reopen(options);
+  test::SleepingBackgroundTask sleeping_task;
+  // Block flushes
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  sleeping_task.WaitUntilSleeping();
+
+  // Fill up 3 memtables without triggering a write delay.
+  for (int i = 0; i < 3; i++) {
+    // Fill one mem table
+    for (int j = 0; j < kNumKeysPerMemtable; j++) {
+      Put(Key(j), "");
+    }
+    ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  }
+
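WriteStallListener above is a minimal consumer of stall notifications; WriteStallInfo carries the column family name plus the previous and current WriteStallCondition (kNormal, kDelayed, kStopped). A sketch of a listener that only reacts to full stops, using the public listener API (the class name is illustrative):

    #include <cstdio>
    #include "rocksdb/listener.h"

    class StopLogger : public rocksdb::EventListener {
     public:
      void OnStallConditionsChanged(
          const rocksdb::WriteStallInfo& info) override {
        // info.condition.prev -> info.condition.cur is the transition.
        if (info.condition.cur == rocksdb::WriteStallCondition::kStopped) {
          fprintf(stderr, "writes stopped for CF %s\n", info.cf_name.c_str());
        }
      }
    };
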
+  // Inserting a new entry would create a new memtable, triggering the
+  // slowdown.
+  Put(Key(0), "");
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+}
+#endif  // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression, kLZ4HCCompression,
+                                    kXpressCompression};
+  for (auto comp : compressions) {
+    if (!CompressionTypeSupported(comp)) {
+      // not supported, we should fail the Open()
+      Options options = CurrentOptions();
+      options.compression = comp;
+      ASSERT_TRUE(!TryReopen(options).ok());
+      // Check that CreateColumnFamily also fails
+      options.compression = kNoCompression;
+      ASSERT_OK(TryReopen(options));
+      ColumnFamilyOptions cf_options(options);
+      cf_options.compression = comp;
+      ColumnFamilyHandle* handle;
+      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+    }
+  }
+}
+
+TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
+  Options options = CurrentOptions();
+  options.max_open_files = 100;
+  Reopen(options);
+
+  ColumnFamilyOptions cf_options(options);
+  // ttl is now supported even when max_open_files is not -1, so this
+  // creation succeeds.
+  cf_options.ttl = 3600;
+  ColumnFamilyHandle* handle;
+  ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
+  delete handle;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, RowCache) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
+
+TEST_F(DBTest, PinnableSliceAndRowCache) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    PinnableSlice pin_slice;
+    ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+    ASSERT_EQ(pin_slice.ToString(), "bar");
+    // Entry is already in cache; the lookup removes the element from the LRU
+    // list while it is pinned.
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After the PinnableSlice is destroyed, the element is added back to the
+  // LRU list.
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+}
+
+#endif  // ROCKSDB_LITE
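PinnableSlice, exercised above, lets Get() hand back a reference into a pinned block or row-cache entry instead of copying into a std::string; the pin is released when the slice is Reset() or destroyed. Typical usage with the public API (function name and fallback value are illustrative):

    #include <string>
    #include "rocksdb/db.h"

    // Read "foo" without an extra copy; the value stays pinned only while
    // `value` is alive, so copy it out if it must outlive the slice.
    std::string ReadFoo(rocksdb::DB* db) {
      rocksdb::PinnableSlice value;
      rocksdb::Status s = db->Get(rocksdb::ReadOptions(),
                                  db->DefaultColumnFamily(), "foo", &value);
      return s.ok() ? value.ToString() : std::string("NOT_FOUND");
    }
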
+
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
+       {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  Options options = CurrentOptions();
+  options.max_total_wal_size = 8192;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 1 << 20;
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+  ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  ASSERT_OK(Put(0, "key2", DummyString(8192)));
+  // the oldest wal should now be getting flushed
+  ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+  // all flushes should now do nothing because their CF is dropped
+  TEST_SYNC_POINT("Test:AllowFlushes");
+  TEST_SYNC_POINT("Test:WaitForFlush");
+  uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+  ASSERT_OK(Put(1, "key3", DummyString(8192)));
+  ASSERT_OK(Put(1, "key4", DummyString(8192)));
+  // a new wal should have been created
+  uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+  EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, UnsupportedManualSync) {
+  DestroyAndReopen(CurrentOptions());
+  env_->is_wal_sync_thread_safe_.store(false);
+  Status s = db_->SyncWAL();
+  ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+                        ::testing::Combine(::testing::Values(1, 4),
+                                           ::testing::Bool()));
+
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;  // Small write buffer
+  Reopen(options);
+
+  std::vector<port::Thread> threads;
+  std::atomic<bool> done(false);
+  db_->PauseBackgroundWork();
+  threads.emplace_back([&]() {
+    Random rnd(301);
+    for (int i = 0; i < 10000; ++i) {
+      Put(RandomString(&rnd, 10), RandomString(&rnd, 10));
+    }
+    done.store(true);
+  });
+  env_->SleepForMicroseconds(200000);
+  // make sure the thread is not done
+  ASSERT_FALSE(done.load());
+  db_->ContinueBackgroundWork();
+  for (auto& t : threads) {
+    t.join();
+  }
+  // now it's done
+  ASSERT_TRUE(done.load());
+}
+
+// Keep spawning short-lived threads that create an iterator and quit.
+// Meanwhile, in another thread, keep flushing memtables.
+// This used to cause a deadlock.
+TEST_F(DBTest, ThreadLocalPtrDeadlock) {
+  std::atomic<int> flushes_done{0};
+  std::atomic<int> threads_destroyed{0};
+  auto done = [&] { return flushes_done.load() > 10; };
+
+  port::Thread flushing_thread([&] {
+    for (int i = 0; !done(); ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
+                         Slice(std::to_string(i).c_str())));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      int cnt = ++flushes_done;
+      fprintf(stderr, "Flushed %d times\n", cnt);
+    }
+  });
+
+  std::vector<port::Thread> thread_spawning_threads(10);
+  for (auto& t : thread_spawning_threads) {
+    t = port::Thread([&] {
+      while (!done()) {
+        {
+          port::Thread tmp_thread([&] {
+            auto it = db_->NewIterator(ReadOptions());
+            delete it;
+          });
+          tmp_thread.join();
+        }
+        ++threads_destroyed;
+      }
+    });
+  }
+
+  for (auto& t : thread_spawning_threads) {
+    t.join();
+  }
+  flushing_thread.join();
+  fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
+          flushes_done.load(), threads_destroyed.load());
+}
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(0, "foo", "bar"));
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, CreationTimeOfOldestFile) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  env_->time_elapse_only_sleep_ = false;
+  options.env = env_;
+
+  env_->addon_time_.store(0);
+  DestroyAndReopen(options);
+
+  bool set_file_creation_time_to_zero = true;
+  int idx = 0;
+
+  int64_t time_1 = 0;
+  env_->GetCurrentTime(&time_1);
+  const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
+
+  // Add 50 hours
+  env_->addon_time_.fetch_add(50 * 60 * 60);
+
+  int64_t time_2 = 0;
+  env_->GetCurrentTime(&time_2);
+  const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+        TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+        if (set_file_creation_time_to_zero) {
+          if (idx == 0) {
+            props->file_creation_time = 0;
+            idx++;
+          } else if (idx == 1) {
+            props->file_creation_time = uint_time_1;
+            idx = 0;
+          }
+        } else {
+          if (idx == 0) {
+            props->file_creation_time = uint_time_1;
+            idx++;
+          } else if (idx == 1) {
+            props->file_creation_time = uint_time_2;
+          }
+        }
+      });
+  // Set file creation time in manifest all to 0.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FileMetaData::FileMetaData", [&](void* arg) {
+        FileMetaData* meta = static_cast<FileMetaData*>(arg);
+        meta->file_creation_time = 0;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+    }
+    Flush();
+  }
+
+  // At this point there should be 2 files, one with file_creation_time = 0
+  // and the other non-zero. GetCreationTimeOfOldestFile API should return 0.
+  uint64_t creation_time;
+  Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
+  ASSERT_EQ(0, creation_time);
+  ASSERT_EQ(s1, Status::OK());
+
+  // Testing with non-zero file creation time.
+  set_file_creation_time_to_zero = false;
+  options = CurrentOptions();
+  options.max_open_files = -1;
+  env_->time_elapse_only_sleep_ = false;
+  options.env = env_;
+
+  env_->addon_time_.store(0);
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize)));
+    }
+    Flush();
+  }
+
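GetCreationTimeOfOldestFile, asserted on below, reports the creation time of the oldest live SST file (0 when any file lacks the property) and requires max_open_files == -1 so all file metadata stays loaded. A usage sketch with the public API (the function name is illustrative):

    #include <cinttypes>
    #include <cstdio>
    #include "rocksdb/db.h"

    void PrintOldestFileTime(rocksdb::DB* db) {
      uint64_t t = 0;
      rocksdb::Status s = db->GetCreationTimeOfOldestFile(&t);
      if (s.ok()) {
        fprintf(stderr, "oldest SST created at unix time %" PRIu64 "\n", t);
      } else {
        // Returns NotSupported unless the DB keeps all files open
        // (max_open_files == -1).
        fprintf(stderr, "unsupported: %s\n", s.ToString().c_str());
      }
    }
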
+  // At this point there should be 2 files with non-zero file creation times.
+  // GetCreationTimeOfOldestFile API should return a non-zero value.
+  uint64_t ctime;
+  Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+  ASSERT_EQ(uint_time_1, ctime);
+  ASSERT_EQ(s2, Status::OK());
+
+  // Testing with max_open_files != -1
+  options = CurrentOptions();
+  options.max_open_files = 10;
+  DestroyAndReopen(options);
+  Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+  ASSERT_EQ(s3, Status::NotSupported());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif  // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test2.cc b/src/rocksdb/db/db_test2.cc
new file mode 100644
index 000000000..f4e8e960a
--- /dev/null
+++ b/src/rocksdb/db/db_test2.cc
@@ -0,0 +1,4695 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/read_callback.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/fault_injection_test_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest2 : public DBTestBase {
+ public:
+  DBTest2() : DBTestBase("/db_test2") {}
+};
+
+class PrefixFullBloomWithReverseComparator
+    : public DBTestBase,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  PrefixFullBloomWithReverseComparator()
+      : DBTestBase("/prefix_bloom_reverse") {}
+  void SetUp() override { if_cache_filter_ = GetParam(); }
+  bool if_cache_filter_;
+};
+
+TEST_P(PrefixFullBloomWithReverseComparator,
+       PrefixFullBloomWithReverseComparator) {
+  Options options = last_options_;
+  options.comparator = ReverseBytewiseComparator();
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  if (if_cache_filter_) {
+    bbto.no_block_cache = false;
+    bbto.cache_index_and_filter_blocks = true;
+    bbto.block_cache = NewLRUCache(1);
+  }
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
+
+  dbfull()->Flush(FlushOptions());
+
+  if (bbto.block_cache) {
+    bbto.block_cache->EraseUnRefEntries();
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  iter->Seek("bar345");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar234", iter->key().ToString());
+  ASSERT_EQ("foo2", iter->value().ToString());
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar123", iter->key().ToString());
+  ASSERT_EQ("foo", iter->value().ToString());
+
+  iter->Seek("foo234");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo123", iter->key().ToString());
+  ASSERT_EQ("foo3", iter->value().ToString());
+
+  iter->Seek("bar");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+}
+
+INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
+                        PrefixFullBloomWithReverseComparator, testing::Bool());
+
+TEST_F(DBTest2, IteratorPropertyVersionNumber) {
+  Put("", "");
+  Iterator* iter1 = db_->NewIterator(ReadOptions());
+  std::string prop_value;
+  ASSERT_OK(
+      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number1 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  Put("", "");
+  Flush();
+
+  Iterator* iter2 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(
+      iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number2 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_GT(version_number2, version_number1);
+
+  Put("", "");
+
+  Iterator* iter3 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(
+      iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number3 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_EQ(version_number2, version_number3);
+
+  iter1->SeekToFirst();
+  ASSERT_OK(
+      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number1_new =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+  ASSERT_EQ(version_number1, version_number1_new);
+
+  delete iter1;
+  delete iter2;
+  delete iter3;
+}
+
+TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Put(1, "a", "begin");
+  Put(1, "z", "end");
+  ASSERT_OK(Flush(1));
+  TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  std::string value;
+  value = Get(1, "a");
+}
+
+TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_successive_merges = 3;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  Put("poi", "Finch");
+  db_->Merge(WriteOptions(), "poi", "Reese");
+  db_->Merge(WriteOptions(), "poi", "Shaw");
+  db_->Merge(WriteOptions(), "poi", "Root");
+  options.max_successive_merges = 2;
+  Reopen(options);
+}
+
+#ifndef ROCKSDB_LITE
+class DBTestSharedWriteBufferAcrossCFs
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  DBTestSharedWriteBufferAcrossCFs()
+      : DBTestBase("/db_test_shared_write_buffer") {}
+  void SetUp() override {
+    use_old_interface_ = std::get<0>(GetParam());
+    cost_cache_ = std::get<1>(GetParam());
+  }
+  bool use_old_interface_;
+  bool cost_cache_;
+};
+
+TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+
+  // Avoid nondeterministic values from malloc_usable_size();
+  // Force arena block size to 1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::Arena:0", [&](void* arg) {
+        size_t* block_size = static_cast<size_t*>(arg);
+        *block_size = 1;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::AllocateNewBlock:0", [&](void* arg) {
+        std::pair<size_t*, size_t*>* pair =
+            static_cast<std::pair<size_t*, size_t*>*>(arg);
+        *std::get<0>(*pair) = *std::get<1>(*pair);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // The total soft write buffer size is about 105000
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+
+  if (use_old_interface_) {
+    options.db_write_buffer_size = 120000;  // this is the real limit
+  } else if (!cost_cache_) {
+    options.write_buffer_manager.reset(new WriteBufferManager(114285));
+  } else {
+    options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
+  }
+  options.write_buffer_size = 500000;  // this is never hit
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  std::function<void()> wait_flush = [&]() {
+    dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+    dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  };
+
+  // Create some data and flush "default" and "nikitich" so that they
+  // are newer CFs created.
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+            static_cast<uint64_t>(1));
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+            static_cast<uint64_t>(1));
+
+  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+  }
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+  }
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  // No flush should trigger
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+
+  // Trigger a flush. Flushing "nikitich".
+  ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Without hitting the threshold, no flush should trigger.
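+  // (In this version the manager's flush trigger fires once mutable memtable
+  // usage crosses roughly 7/8 of the configured size, which is where the
+  // "about 105000" soft limit above comes from.)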
+  ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Hit the write buffer limit again. "default"
+  // will have been flushed.
+  ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
+  wait_flush();
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Trigger another flush. This time "dobrynia". "pikachu" should not
+  // be flushed, even though it was never flushed.
+  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
+  wait_flush();
+  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    Close();
+    options.write_buffer_manager.reset();
+    last_options_.write_buffer_manager.reset();
+    ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
+                        DBTestSharedWriteBufferAcrossCFs,
+                        ::testing::Values(std::make_tuple(true, false),
+                                          std::make_tuple(false, false),
+                                          std::make_tuple(false, true)));
+
+TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
+  std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  // Avoid nondeterministic values from malloc_usable_size();
+  // Force arena block size to 1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::Arena:0", [&](void* arg) {
+        size_t* block_size = static_cast<size_t*>(arg);
+        *block_size = 1;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::AllocateNewBlock:0", [&](void* arg) {
+        std::pair<size_t*, size_t*>* pair =
+            static_cast<std::pair<size_t*, size_t*>*>(arg);
+        *std::get<0>(*pair) = *std::get<1>(*pair);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  options.write_buffer_size = 500000;  // this is never hit
+  // Use a write buffer total size so that the soft limit is about
+  // 105000.
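+  // (The same WriteBufferManager instance is handed to db2 below, so memtable
+  // memory is tracked, and flushes are triggered, across both DBs.)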
+  options.write_buffer_manager.reset(new WriteBufferManager(120000));
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  ASSERT_OK(DestroyDB(dbname2, options));
+  DB* db2 = nullptr;
+  ASSERT_OK(DB::Open(options, dbname2, &db2));
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  std::function<void()> wait_flush = [&]() {
+    dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+    static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+  };
+
+  // Trigger a flush on cf2
+  ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
+  wait_flush();
+
+  // Insert to DB2
+  ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
+  wait_flush();
+
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
+                  GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
+                  GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(0));
+  }
+
+  // Trigger a flush of another CF in DB1
+  ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(0));
+  }
+
+  // Trigger a flush in DB2.
+  ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
+  wait_flush();
+  ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
+  wait_flush();
+  static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(1));
+  }
+
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  std::shared_ptr<Cache> cache =
+      NewLRUCache(LRUCacheOptions(10000000, 1, false, 0.0));
+  options.write_buffer_size = 50000;  // this is never hit
+  // A zero-sized WriteBufferManager: no flush limit is enforced, but
+  // memtable memory is still charged to the cache.
+  options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  // One dummy entry is 256KB.
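+  // (The manager reserves cache capacity in 256KB dummy entries, so even this
+  // single small Put pushes reported usage past 128000.)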
+  ASSERT_GT(cache->GetUsage(), 128000);
+}
+
+namespace {
+void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
+                          const std::vector<Slice>& keys_must_not_exist) {
+  // Ensure that expected keys exist
+  std::vector<std::string> values;
+  if (keys_must_exist.size() > 0) {
+    std::vector<Status> status_list =
+        db->MultiGet(ReadOptions(), keys_must_exist, &values);
+    for (size_t i = 0; i < keys_must_exist.size(); i++) {
+      ASSERT_OK(status_list[i]);
+    }
+  }
+
+  // Ensure that given keys don't exist
+  if (keys_must_not_exist.size() > 0) {
+    std::vector<Status> status_list =
+        db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
+    for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
+      ASSERT_TRUE(status_list[i].IsNotFound());
+    }
+  }
+}
+}  // namespace
+
+TEST_F(DBTest2, WalFilterTest) {
+  class TestWalFilter : public WalFilter {
+   private:
+    // Processing option that is requested to be applied at the given index
+    WalFilter::WalProcessingOption wal_processing_option_;
+    // Index at which to apply wal_processing_option_
+    // At other indexes default wal_processing_option::kContinueProcessing is
+    // returned.
+    size_t apply_option_at_record_index_;
+    // Current record index, incremented with each record encountered.
+    size_t current_record_index_;
+
+   public:
+    TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
+                  size_t apply_option_for_record_index)
+        : wal_processing_option_(wal_processing_option),
+          apply_option_at_record_index_(apply_option_for_record_index),
+          current_record_index_(0) {}
+
+    WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+                                  WriteBatch* /*new_batch*/,
+                                  bool* /*batch_changed*/) const override {
+      WalFilter::WalProcessingOption option_to_return;
+
+      if (current_record_index_ == apply_option_at_record_index_) {
+        option_to_return = wal_processing_option_;
+      }
+      else {
+        option_to_return = WalProcessingOption::kContinueProcessing;
+      }
+
+      // Filter is passed as a const object for RocksDB to not modify the
+      // object, however we modify it for our own purpose here and hence
+      // cast the constness away.
+      (const_cast<TestWalFilter*>(this)->current_record_index_)++;
+
+      return option_to_return;
+    }
+
+    const char* Name() const override { return "TestWalFilter"; }
+  };
+
+  // Create 3 batches with two keys each
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  // Test with all WAL processing options
+  for (int option = 0;
+       option < static_cast<int>(
+           WalFilter::WalProcessingOption::kWalProcessingOptionMax);
+       option++) {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({ "pikachu" }, options);
+
+    // Write given keys in given batches
+    for (size_t i = 0; i < batch_keys.size(); i++) {
+      WriteBatch batch;
+      for (size_t j = 0; j < batch_keys[i].size(); j++) {
+        batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+      }
+      dbfull()->Write(WriteOptions(), &batch);
+    }
+
+    WalFilter::WalProcessingOption wal_processing_option =
+        static_cast<WalFilter::WalProcessingOption>(option);
+
+    // Create a test filter that would apply wal_processing_option at the first
+    // record
+    size_t apply_option_for_record_index = 1;
+    TestWalFilter test_wal_filter(wal_processing_option,
+                                  apply_option_for_record_index);
+
+    // Reopen database with option to use WAL filter
+    options = OptionsForLogIterTest();
+    options.wal_filter = &test_wal_filter;
+    Status status =
+        TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
+    if (wal_processing_option ==
+        WalFilter::WalProcessingOption::kCorruptedRecord) {
+      assert(!status.ok());
+      // In case of corruption we can turn off paranoid_checks to reopen the
+      // database
+      options.paranoid_checks = false;
+      ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+    }
+    else {
+      assert(status.ok());
+    }
+
+    // Compute which keys we expect to be found
+    // and which we expect not to be found after recovery.
+    std::vector<Slice> keys_must_exist;
+    std::vector<Slice> keys_must_not_exist;
+    switch (wal_processing_option) {
+      case WalFilter::WalProcessingOption::kCorruptedRecord:
+      case WalFilter::WalProcessingOption::kContinueProcessing: {
+        fprintf(stderr, "Testing with complete WAL processing\n");
+        // we expect all records to be processed
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            keys_must_exist.push_back(Slice(batch_keys[i][j]));
+          }
+        }
+        break;
+      }
+      case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
+        fprintf(stderr,
+                "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
+                apply_option_for_record_index);
+        // We expect the record with apply_option_for_record_index to be not
+        // found.
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            if (i == apply_option_for_record_index) {
+              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+            }
+            else {
+              keys_must_exist.push_back(Slice(batch_keys[i][j]));
+            }
+          }
+        }
+        break;
+      }
+      case WalFilter::WalProcessingOption::kStopReplay: {
+        fprintf(stderr,
+                "Testing with stopping replay from record %" ROCKSDB_PRIszt
+                "\n",
+                apply_option_for_record_index);
+        // We expect records beyond apply_option_for_record_index to be not
+        // found.
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            if (i >= apply_option_for_record_index) {
+              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+            }
+            else {
+              keys_must_exist.push_back(Slice(batch_keys[i][j]));
+            }
+          }
+        }
+        break;
+      }
+      default:
+        assert(false);  // unhandled case
+    }
+
+    bool checked_after_reopen = false;
+
+    while (true) {
+      // Ensure that expected keys exist
+      // and unexpected keys don't exist after recovery
+      ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+      if (checked_after_reopen) {
+        break;
+      }
+
+      // reopen database again to make sure previous log(s) are not used
+      // (even if they were skipped)
+      // reopen database with option to use WAL filter
+      options = OptionsForLogIterTest();
+      ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+      checked_after_reopen = true;
+    }
+  }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
+  class ChangeBatchHandler : public WriteBatch::Handler {
+   private:
+    // Batch to insert keys in
+    WriteBatch* new_write_batch_;
+    // Number of keys to add in the new batch
+    size_t num_keys_to_add_in_new_batch_;
+    // Number of keys added to new batch
+    size_t num_keys_added_;
+
+   public:
+    ChangeBatchHandler(WriteBatch* new_write_batch,
+                       size_t num_keys_to_add_in_new_batch)
+        : new_write_batch_(new_write_batch),
+          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+          num_keys_added_(0) {}
+    void Put(const Slice& key, const Slice& value) override {
+      if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
+        new_write_batch_->Put(key, value);
+        ++num_keys_added_;
+      }
+    }
+  };
+
+  class TestWalFilterWithChangeBatch : public WalFilter {
+   private:
+    // Index at which to start changing records
+    size_t change_records_from_index_;
+    // Number of keys to add in the new batch
+    size_t num_keys_to_add_in_new_batch_;
+    // Current record index, incremented with each record encountered.
+    size_t current_record_index_;
+
+   public:
+    TestWalFilterWithChangeBatch(size_t change_records_from_index,
+                                 size_t num_keys_to_add_in_new_batch)
+        : change_records_from_index_(change_records_from_index),
+          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+          current_record_index_(0) {}
+
+    WalProcessingOption LogRecord(const WriteBatch& batch,
+                                  WriteBatch* new_batch,
+                                  bool* batch_changed) const override {
+      if (current_record_index_ >= change_records_from_index_) {
+        ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
+        batch.Iterate(&handler);
+        *batch_changed = true;
+      }
+
+      // Filter is passed as a const object for RocksDB to not modify the
+      // object, however we modify it for our own purpose here and hence
+      // cast the constness away.
+      (const_cast<TestWalFilterWithChangeBatch*>(this)
+           ->current_record_index_)++;
+
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override {
+      return "TestWalFilterWithChangeBatch";
+    }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({ "pikachu" }, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      batch.Put(handles_[0], batch_keys[i][j], DummyString(1024));
+    }
+    dbfull()->Write(WriteOptions(), &batch);
+  }
+
+  // Create a test filter that would apply wal_processing_option at the first
+  // record
+  size_t change_records_from_index = 1;
+  size_t num_keys_to_add_in_new_batch = 1;
+  TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
+      change_records_from_index, num_keys_to_add_in_new_batch);
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_with_change_batch;
+  ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+  // Ensure that all keys exist before change_records_from_index_,
+  // and after that index only a single key exists,
+  // as our filter adds only a single key for each batch
+  std::vector<Slice> keys_must_exist;
+  std::vector<Slice> keys_must_not_exist;
+
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
+        keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+      }
+      else {
+        keys_must_exist.push_back(Slice(batch_keys[i][j]));
+      }
+    }
+  }
+
+  bool checked_after_reopen = false;
+
+  while (true) {
+    // Ensure that expected keys exist
+    // and unexpected keys don't exist after recovery
+    ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+    if (checked_after_reopen) {
+      break;
+    }
+
+    // reopen database again to make sure previous log(s) are not used
+    // (even if they were skipped)
+    // reopen database with option to use WAL filter
+    options = OptionsForLogIterTest();
+    ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+    checked_after_reopen = true;
+  }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
+  class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
+   public:
+    WalProcessingOption LogRecord(const WriteBatch& batch,
+                                  WriteBatch* new_batch,
+                                  bool* batch_changed) const override {
+      *new_batch = batch;
+      new_batch->Put("key_extra", "value_extra");
+      *batch_changed = true;
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override {
+      return "WalFilterTestWithChangeBatchExtraKeys";
+    }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({ "pikachu" }, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      batch.Put(handles_[0], batch_keys[i][j],
+                DummyString(1024));
+    }
+    dbfull()->Write(WriteOptions(), &batch);
+  }
+
+  // Create a test filter that would add extra keys
+  TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_extra_keys;
+  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(status.IsNotSupported());
+
+  // Reopen without filter, now reopen should succeed - previous
+  // attempt to open must not have altered the db.
+  options = OptionsForLogIterTest();
+  ReopenWithColumnFamilies({ "default", "pikachu" }, options);
+
+  std::vector<Slice> keys_must_exist;
+  std::vector<Slice> keys_must_not_exist;  // empty vector
+
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      keys_must_exist.push_back(Slice(batch_keys[i][j]));
+    }
+  }
+
+  ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+}
+
+TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
+  class TestWalFilterWithColumnFamilies : public WalFilter {
+   private:
+    // column_family_id -> log_number map (provided to WALFilter)
+    std::map<uint32_t, uint64_t> cf_log_number_map_;
+    // column_family_name -> column_family_id map (provided to WALFilter)
+    std::map<std::string, uint32_t> cf_name_id_map_;
+    // column_family_name -> keys_found_in_wal map
+    // We store keys that are applicable to the column_family
+    // during recovery (i.e. aren't already flushed to SST file(s))
+    // for verification against the keys we expect.
+    std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
+
+   public:
+    void ColumnFamilyLogNumberMap(
+        const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+        const std::map<std::string, uint32_t>& cf_name_id_map) override {
+      cf_log_number_map_ = cf_lognumber_map;
+      cf_name_id_map_ = cf_name_id_map;
+    }
+
+    WalProcessingOption LogRecordFound(unsigned long long log_number,
+                                       const std::string& /*log_file_name*/,
+                                       const WriteBatch& batch,
+                                       WriteBatch* /*new_batch*/,
+                                       bool* /*batch_changed*/) override {
+      class LogRecordBatchHandler : public WriteBatch::Handler {
+       private:
+        const std::map<uint32_t, uint64_t>& cf_log_number_map_;
+        std::map<uint32_t, std::vector<std::string>>& cf_wal_keys_;
+        unsigned long long log_number_;
+
+       public:
+        LogRecordBatchHandler(
+            unsigned long long current_log_number,
+            const std::map<uint32_t, uint64_t>& cf_log_number_map,
+            std::map<uint32_t, std::vector<std::string>>& cf_wal_keys)
+            : cf_log_number_map_(cf_log_number_map),
+              cf_wal_keys_(cf_wal_keys),
+              log_number_(current_log_number) {}
+
+        Status PutCF(uint32_t column_family_id, const Slice& key,
+                     const Slice& /*value*/) override {
+          auto it = cf_log_number_map_.find(column_family_id);
+          assert(it != cf_log_number_map_.end());
+          unsigned long long log_number_for_cf = it->second;
+          // If the current record is applicable for column_family_id
+          // (i.e. isn't flushed to SST file(s) for column_family_id)
+          // add it to the cf_wal_keys_ map for verification.
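+          // (A column family's log number is the first WAL that can still
+          // hold unflushed data for it; records in older WALs were already
+          // persisted by the pre-reopen flush.)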
+          if (log_number_ >= log_number_for_cf) {
+            cf_wal_keys_[column_family_id].push_back(
+                std::string(key.data(), key.size()));
+          }
+          return Status::OK();
+        }
+      } handler(log_number, cf_log_number_map_, cf_wal_keys_);
+
+      batch.Iterate(&handler);
+
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override {
+      return "WalFilterTestWithColumnFamilies";
+    }
+
+    const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
+      return cf_wal_keys_;
+    }
+
+    const std::map<std::string, uint32_t>& GetColumnFamilyNameIdMap() {
+      return cf_name_id_map_;
+    }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
+
+  batch_keys_pre_flush[0].push_back("key1");
+  batch_keys_pre_flush[0].push_back("key2");
+  batch_keys_pre_flush[1].push_back("key3");
+  batch_keys_pre_flush[1].push_back("key4");
+  batch_keys_pre_flush[2].push_back("key5");
+  batch_keys_pre_flush[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({ "pikachu" }, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+      batch.Put(handles_[0], batch_keys_pre_flush[i][j], DummyString(1024));
+      batch.Put(handles_[1], batch_keys_pre_flush[i][j], DummyString(1024));
+    }
+    dbfull()->Write(WriteOptions(), &batch);
+  }
+
+  // Flush default column-family
+  db_->Flush(FlushOptions(), handles_[0]);
+
+  // Do some more writes
+  std::vector<std::vector<std::string>> batch_keys_post_flush(3);
+
+  batch_keys_post_flush[0].push_back("key7");
+  batch_keys_post_flush[0].push_back("key8");
+  batch_keys_post_flush[1].push_back("key9");
+  batch_keys_post_flush[1].push_back("key10");
+  batch_keys_post_flush[2].push_back("key11");
+  batch_keys_post_flush[2].push_back("key12");
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      batch.Put(handles_[0], batch_keys_post_flush[i][j], DummyString(1024));
+      batch.Put(handles_[1], batch_keys_post_flush[i][j], DummyString(1024));
+    }
+    dbfull()->Write(WriteOptions(), &batch);
+  }
+
+  // On Recovery we should only find the second batch applicable to default CF
+  // But both batches applicable to pikachu CF
+
+  // Create a test filter that would add extra keys
+  TestWalFilterWithColumnFamilies test_wal_filter_column_families;
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_column_families;
+  Status status =
+      TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
+  ASSERT_TRUE(status.ok());
+
+  // verify that handles_[0] only has post_flush keys
+  // while handles_[1] has pre and post flush keys
+  auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
+  auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
+  size_t index = 0;
+  auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
+  // default column-family, only post_flush keys are expected
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_post_flush[i][j]);
+      ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+    }
+  }
+  ASSERT_TRUE(index == keys_cf.size());
+
+  index = 0;
+  keys_cf = cf_wal_keys[name_id_map["pikachu"]];
+  // pikachu column-family, all keys are expected
+  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_pre_flush[i][j]);
+      ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+    }
+  }
+
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_post_flush[i][j]);
+      ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0);
+    }
+  }
+  ASSERT_TRUE(index == keys_cf.size());
+}
+
+TEST_F(DBTest2, PresetCompressionDict) {
+  // Verifies that compression ratio improves when dictionary is enabled, and
+  // improves even further when the dictionary is trained by ZSTD.
+  const size_t kBlockSizeBytes = 4 << 10;
+  const size_t kL0FileBytes = 128 << 10;
+  const size_t kApproxPerBlockOverheadBytes = 50;
+  const int kNumL0Files = 5;
+
+  Options options;
+  // Make sure to use any custom env that the test is configured with.
+  options.env = CurrentOptions().env;
+  options.allow_concurrent_memtable_write = false;
+  options.arena_block_size = kBlockSizeBytes;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
+  options.num_levels = 2;
+  options.target_file_size_base = kL0FileBytes;
+  options.target_file_size_multiplier = 2;
+  options.write_buffer_size = kL0FileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = kBlockSizeBytes;
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  compression_types.push_back(kLZ4Compression);
+  compression_types.push_back(kLZ4HCCompression);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  }
+
+  enum DictionaryTypes : int {
+    kWithoutDict,
+    kWithDict,
+    kWithZSTDTrainedDict,
+    kDictEnd,
+  };
+
+  for (auto compression_type : compression_types) {
+    options.compression = compression_type;
+    size_t bytes_without_dict = 0;
+    size_t bytes_with_dict = 0;
+    size_t bytes_with_zstd_trained_dict = 0;
+    for (int i = kWithoutDict; i < kDictEnd; i++) {
+      // First iteration: compress without preset dictionary
+      // Second iteration: compress with preset dictionary
+      // Third iteration (zstd only): compress with zstd-trained dictionary
+      //
+      // To make sure the compression dictionary has the intended effect, we
+      // verify the compressed size is smaller in successive iterations. Also
+      // in the non-first iterations, verify the data we get out is the same
+      // data we put in.
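+      // (max_dict_bytes enables the preset dictionary; zstd_max_train_bytes
+      // additionally feeds sampled input through ZSTD's dictionary trainer.)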
+      switch (i) {
+        case kWithoutDict:
+          options.compression_opts.max_dict_bytes = 0;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithDict:
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithZSTDTrainedDict:
+          if (compression_type != kZSTD) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          break;
+        default:
+          assert(false);
+      }
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      CreateAndReopenWithCF({"pikachu"}, options);
+      Random rnd(301);
+      std::string seq_datas[10];
+      for (int j = 0; j < 10; ++j) {
+        seq_datas[j] =
+            RandomString(&rnd, kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+      }
+
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      for (int j = 0; j < kNumL0Files; ++j) {
+        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+                        seq_datas[(key_num / 10) % 10]));
+        }
+        dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+      }
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                  true /* disallow_trivial_move */);
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+      // Get the live sst files size
+      size_t total_sst_bytes = TotalSize(1);
+      if (i == kWithoutDict) {
+        bytes_without_dict = total_sst_bytes;
+      } else if (i == kWithDict) {
+        bytes_with_dict = total_sst_bytes;
+      } else if (i == kWithZSTDTrainedDict) {
+        bytes_with_zstd_trained_dict = total_sst_bytes;
+      }
+
+      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+           j++) {
+        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+      }
+      if (i == kWithDict) {
+        ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a trained
+        // dictionary does not get as good a compression ratio as without
+        // training.
+        // But using a dictionary (with or without training) should always get
+        // better compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+                    bytes_without_dict > bytes_with_zstd_trained_dict);
+      }
+
+      DestroyAndReopen(options);
+    }
+  }
+}
+
+TEST_F(DBTest2, PresetCompressionDictLocality) {
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  // Verifies that compression dictionary is generated from local data. The
+  // verification simply checks all output SSTs have different compression
+  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+  // the future.
+  const int kNumEntriesPerFile = 1 << 10;  // 1024 entries
+  const int kNumBytesPerEntry = 1 << 10;   // 1KB
+  const int kNumFiles = 4;
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
+  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+                    RandomString(&rnd, kNumBytesPerEntry)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(1);
+    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+  }
+
+  // Store all the dictionaries generated during a full compaction.
+  std::vector<std::string> compression_dicts;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+      [&](void* arg) {
+        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  CompactRangeOptions compact_range_opts;
+  compact_range_opts.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+  // Dictionary compression should not be so good as to compress four totally
+  // random files into one. If it does then there's probably something wrong
+  // with the test.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Furthermore, there should be one compression dictionary generated per
+  // file, and they should all be different from each other.
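+  // (The loop below only compares adjacent dictionaries, which is enough to
+  // catch the failure mode where every file gets the same dictionary.)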
+  ASSERT_EQ(NumTableFilesAtLevel(1),
+            static_cast<int>(compression_dicts.size()));
+  for (size_t i = 1; i < compression_dicts.size(); ++i) {
+    std::string& a = compression_dicts[i - 1];
+    std::string& b = compression_dicts[i];
+    size_t alen = a.size();
+    size_t blen = b.size();
+    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+  }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+  explicit CompactionCompressionListener(Options* db_options)
+      : db_options_(db_options) {}
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+    // Figure out last level with files
+    int bottommost_level = 0;
+    for (int level = 0; level < db->NumberLevels(); level++) {
+      std::string files_at_level;
+      ASSERT_TRUE(
+          db->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                          &files_at_level));
+      if (files_at_level != "0") {
+        bottommost_level = level;
+      }
+    }
+
+    if (db_options_->bottommost_compression != kDisableCompressionOption &&
+        ci.output_level == bottommost_level) {
+      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+    } else if (db_options_->compression_per_level.size() != 0) {
+      ASSERT_EQ(ci.compression,
+                db_options_->compression_per_level[ci.output_level]);
+    } else {
+      ASSERT_EQ(ci.compression, db_options_->compression);
+    }
+    max_level_checked = std::max(max_level_checked, ci.output_level);
+  }
+
+  int max_level_checked = 0;
+  const Options* db_options_;
+};
+
+TEST_F(DBTest2, CompressionOptions) {
+  if (!Zlib_Supported() || !Snappy_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 100;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+
+  CompactionCompressionListener* listener =
+      new CompactionCompressionListener(&options);
+  options.listeners.emplace_back(listener);
+
+  const int kKeySize = 5;
+  const int kValSize = 20;
+  Random rnd(301);
+
+  for (int iter = 0; iter <= 2; iter++) {
+    listener->max_level_checked = 0;
+
+    if (iter == 0) {
+      // Use different compression algorithms for different levels but
+      // always use Zlib for bottommost level
+      options.compression_per_level = {kNoCompression,     kNoCompression,
+                                       kNoCompression,     kSnappyCompression,
+                                       kSnappyCompression, kSnappyCompression,
+                                       kZlibCompression};
+      options.compression = kNoCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 1) {
+      // Use Snappy except for bottommost level use ZLib
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 2) {
+      // Use Snappy everywhere
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kDisableCompressionOption;
+    }
+
+    DestroyAndReopen(options);
+    // Write 10 random files
+    for (int i = 0; i < 10; i++) {
+      for (int j = 0; j < 5; j++) {
+        ASSERT_OK(
+            Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValSize)));
+      }
+      ASSERT_OK(Flush());
+      dbfull()->TEST_WaitForCompact();
+    }
+
+    // Make sure that we wrote enough to check all 7 levels
+    ASSERT_EQ(listener->max_level_checked, 6);
+  }
+}
+
+class CompactionStallTestListener : public EventListener {
+ public:
+  CompactionStallTestListener()
+      : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
+
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+    compacting_files_cnt_ += ci.input_files.size();
+  }
+
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+    compacted_files_cnt_ += ci.input_files.size();
+  }
+
+  std::atomic<size_t> compacting_files_cnt_;
+  std::atomic<size_t> compacted_files_cnt_;
+};
+
+TEST_F(DBTest2, CompactionStall) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
+       {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
+       {"DBTest2::CompactionStall:2",
+        "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
+       {"DBTest2::CompactionStall:3",
+        "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_background_compactions = 40;
+  CompactionStallTestListener* listener = new CompactionStallTestListener();
+  options.listeners.emplace_back(listener);
+  DestroyAndReopen(options);
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  Random rnd(301);
+
+  // 4 Files in L0
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for compaction to be triggered
+  TEST_SYNC_POINT("DBTest2::CompactionStall:0");
+
+  // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again
+  // at DBTest2::CompactionStall::1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  // Another 6 L0 files to trigger compaction again
+  for (int i = 0; i < 6; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for another compaction to be triggered
+  TEST_SYNC_POINT("DBTest2::CompactionStall:1");
+
+  // Hold NotifyOnCompactionBegin in the unlock mutex section
+  TEST_SYNC_POINT("DBTest2::CompactionStall:2");
+
+  // Hold NotifyOnCompactionCompleted in the unlock mutex section
+  TEST_SYNC_POINT("DBTest2::CompactionStall:3");
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_LT(NumTableFilesAtLevel(0),
+            options.level0_file_num_compaction_trigger);
+  ASSERT_GT(listener->compacted_files_cnt_.load(),
+            10 - options.level0_file_num_compaction_trigger);
+  ASSERT_EQ(listener->compacting_files_cnt_.load(),
+            listener->compacted_files_cnt_.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, FirstSnapshotTest) {
+  Options options;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // This snapshot will have sequence number 0, which is the expected
+  // behaviour.
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  Put(1, "k1", std::string(100000, 'x'));  // Fill memtable
+  Put(1, "k2", std::string(100000, 'y'));  // Trigger flush
+
+  db_->ReleaseSnapshot(s1);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, DuplicateSnapshot) {
+  Options options;
+  options = CurrentOptions(options);
+  std::vector<const Snapshot*> snapshots;
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  SequenceNumber oldest_ww_snap, first_ww_snap;
+
+  Put("k", "v");  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+  snapshots.push_back(db_->GetSnapshot());
+  Put("k", "v");  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+  first_ww_snap = snapshots.back()->GetSequenceNumber();
+  Put("k", "v");  // inc seq
+  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+  snapshots.push_back(db_->GetSnapshot());
+  Put("k", "v");  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+
+  {
+    InstrumentedMutexLock l(dbi->mutex());
+    auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
+    ASSERT_EQ(seqs.size(), 4);  // duplicates are not counted
+    ASSERT_EQ(oldest_ww_snap, first_ww_snap);
+  }
+
+  for (auto s : snapshots) {
+    db_->ReleaseSnapshot(s);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+class PinL0IndexAndFilterBlocksTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  PinL0IndexAndFilterBlocksTest()
+      : DBTestBase("/db_pin_l0_index_bloom_test") {}
+  void SetUp() override {
+    infinite_max_files_ = std::get<0>(GetParam());
+    disallow_preload_ = std::get<1>(GetParam());
+  }
+
+  void CreateTwoLevels(Options* options, bool close_afterwards) {
+    if (infinite_max_files_) {
+      options->max_open_files = -1;
+    }
+    options->create_if_missing = true;
+    options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+    options->table_factory.reset(new BlockBasedTableFactory(table_options));
+    CreateAndReopenWithCF({"pikachu"}, *options);
+
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    ASSERT_OK(Flush(1));
+    // move this table to L1
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+
+    // reset block cache
+    table_options.block_cache = NewLRUCache(64 * 1024);
+    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+    TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
+    // create new table at L0
+    Put(1, "a2", "begin2");
+    Put(1, "z2", "end2");
+    ASSERT_OK(Flush(1));
+
+    if (close_afterwards) {
+      Close();  // This ensures that there is no ref to block cache entries
+    }
+    table_options.block_cache->EraseUnRefEntries();
+  }
+
+  bool infinite_max_files_;
+  bool disallow_preload_;
+};
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+       IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
+  Options options = CurrentOptions();
+  if (infinite_max_files_) {
+    options.max_open_files = -1;
+  }
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
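+  // With pin_l0_filter_and_index_blocks_in_cache, the flush below loads the
+  // new L0 file's index and filter blocks into the block cache once and pins
+  // them, which is why the miss/hit counters stay flat afterwards.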
+  ASSERT_OK(Flush(1));
+
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // only index/filter were added
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+  std::string value;
+  // Miss and hit count should remain the same, they're all pinned.
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // Miss and hit count should remain the same, they're all pinned.
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+       MultiLevelIndexAndFilterBlocksCachedWithPinning) {
+  Options options = CurrentOptions();
+  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
+  // get base cache values
+  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+  std::string value;
+  // this should be read from L0
+  // so cache values don't change
+  value = Get(1, "a2");
+  ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // this should be read from L1
+  // the file is opened, prefetching results in a cache filter miss
+  // the block is loaded and added to the cache,
+  // then the get results in a cache hit for L1
+  // When we have infinite max_files, there is still a cache miss because we
+  // have reset the block cache
+  value = Get(1, "a");
+  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
+  Options options = CurrentOptions();
+  // This ensures that db does not ref anything in the block cache, so
+  // EraseUnRefEntries could clear them up.
+  bool close_afterwards = true;
+  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
+
+  // Get base cache values
+  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+  if (disallow_preload_) {
+    // Now we have two files. We narrow the max open files to allow 3 entries
+    // so that preloading SST files won't happen.
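+    // (In this version the table cache is sized to max_open_files minus 10
+    // reserved file handles, so 13 leaves room for 3 table readers.)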
+    options.max_open_files = 13;
+    // RocksDB sanitizes max open files to at least 20. Modify it back.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+          int* max_open_files = static_cast<int*>(arg);
+          *max_open_files = 13;
+        });
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Reopen database. If max_open_files is set as -1, table readers will be
+  // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
+  // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
+  TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  if (!disallow_preload_) {
+    // After reopen, cache misses are increased by one because we read (and
+    // only read) filter and index on L0
+    ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    // If max_open_files is not -1, we do not preload table readers, so there
+    // is no change.
+    ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+  std::string value;
+  // this should be read from L0
+  value = Get(1, "a2");
+  // If max_open_files is -1, we have pinned index and filter in Rep, so there
+  // will not be changes in index and filter misses or hits. If max_open_files
+  // is not -1, Get() will open a TableReader and prefetch index and filter.
+  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // this should be read from L1
+  value = Get(1, "a");
+  if (!disallow_preload_) {
+    // In the infinite max files case, there's a cache miss in executing Get()
+    // because index and filter are not prefetched before.
+    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    // In this case, the cache miss count will be increased by one in
+    // BlockBasedTable::Open() because this is not in the DB::Open() code path,
+    // so we will prefetch L1's index and filter. The cache hit count will also
+    // be increased by one because Get() will read the index and filter from
+    // the block cache prefetched in the previous Open() call.
+    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+
+  // Force a full compaction to one single file. There will be a block
+  // cache read for both of index and filter. If prefetch doesn't explicitly
+  // happen, it will happen when verifying the file.
+ Compact(1, "a", "zzzzz"); + dbfull()->TEST_WaitForCompact(); + + if (!disallow_preload_) { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } + + // Bloom and index hit will happen when a Get() happens. + value = Get(1, "a"); + if (!disallow_preload_) { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } +} + +INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest, + PinL0IndexAndFilterBlocksTest, + ::testing::Values(std::make_tuple(true, false), + std::make_tuple(false, false), + std::make_tuple(false, true))); + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, MaxCompactionBytesTest) { + Options options = CurrentOptions(); + options.memtable_factory.reset( + new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 200 << 10; + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 450 << 10; + options.target_file_size_base = 100 << 10; + // Infinite for full compaction. + options.max_compaction_bytes = options.target_file_size_base * 100; + + Reopen(options); + + Random rnd(301); + + for (int num = 0; num < 8; num++) { + GenerateNewRandomFile(&rnd); + } + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,8", FilesPerLevel(0)); + + // When compact from Ln -> Ln+1, cut a file if the file overlaps with + // more than three files in Ln+1. 
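+  // (max_compaction_bytes = 3 * target_file_size_base below bounds the bytes
+  // of overlapping Ln+1 data one compaction may cover, forcing its output to
+  // be cut into multiple files.)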
+  options.max_compaction_bytes = options.target_file_size_base * 3;
+  Reopen(options);
+
+  GenerateNewRandomFile(&rnd);
+  // Add three more small files that overlap with the previous file.
+  for (int i = 0; i < 3; i++) {
+    Put("a", "z");
+    ASSERT_OK(Flush());
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  // Output files to L1 are cut to three pieces, according to
+  // options.max_compaction_bytes.
+  ASSERT_EQ("0,3,8", FilesPerLevel(0));
+}
+
+static void UniqueIdCallback(void* arg) {
+  int* result = reinterpret_cast<int*>(arg);
+  if (*result == -1) {
+    *result = 0;
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+
+class MockPersistentCache : public PersistentCache {
+ public:
+  explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
+      : is_compressed_(is_compressed), max_size_(max_size) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+  }
+
+  ~MockPersistentCache() override {}
+
+  PersistentCache::StatsType Stats() override {
+    return PersistentCache::StatsType();
+  }
+
+  Status Insert(const Slice& page_key, const char* data,
+                const size_t size) override {
+    MutexLock _(&lock_);
+
+    if (size_ > max_size_) {
+      size_ -= data_.begin()->second.size();
+      data_.erase(data_.begin());
+    }
+
+    data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
+    size_ += size;
+    return Status::OK();
+  }
+
+  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                size_t* size) override {
+    MutexLock _(&lock_);
+    auto it = data_.find(page_key.ToString());
+    if (it == data_.end()) {
+      return Status::NotFound();
+    }
+
+    assert(page_key.ToString() == it->first);
+    data->reset(new char[it->second.size()]);
+    memcpy(data->get(), it->second.c_str(), it->second.size());
+    *size = it->second.size();
+    return Status::OK();
+  }
+
+  bool IsCompressed() override { return is_compressed_; }
+
+  std::string GetPrintableOptions() const override {
+    return "MockPersistentCache";
+  }
+
+  port::Mutex lock_;
+  std::map<std::string, std::string> data_;
+  const bool is_compressed_ = true;
+  size_t size_ = 0;
+  const size_t max_size_ = 10 * 1024;  // 10KiB
+};
+
+#ifdef OS_LINUX
+// Make sure that for CPU time perf context counters, Env::NowCPUNanos()
+// is used, rather than the wall-clock Env::NowNanos().
+TEST_F(DBTest2, TestPerfContextGetCpuTime) {
+  // force resizing table cache so table handle is not preloaded so that
+  // we can measure find_table_nanos during Get().
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  env_->now_cpu_count_.store(0);
+
+  // CPU timing is not enabled with kEnableTimeExceptForMutex
+  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
+  ASSERT_EQ(0, env_->now_cpu_count_.load());
+
+  uint64_t kDummyAddonTime = uint64_t{1000000000000};
+
+  // Add time to NowNanos() reading.
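+  // The idea: the callback below injects kDummyAddonTime (~1000 seconds) of
+  // fake wall-clock time inside TableCache::FindTable. Counters based on
+  // NowNanos() (find_table_nanos) will therefore exceed kDummyAddonTime,
+  // while CPU-time counters (get_cpu_nanos), based on NowCPUNanos(), must
+  // stay far below it, proving the right clock is used.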
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_GT(env_->now_cpu_count_.load(), 2); + ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + + SetPerfLevel(PerfLevel::kDisable); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestPerfContextIterCpuTime) { + DestroyAndReopen(CurrentOptions()); + // force resizing table cache so table handle is not preloaded so that + // we can measure find_table_nanos during iteration + dbfull()->TEST_table_cache()->SetCapacity(0); + + const size_t kNumEntries = 10; + for (size_t i = 0; i < kNumEntries; ++i) { + ASSERT_OK(Put("k" + ToString(i), "v" + ToString(i))); + } + ASSERT_OK(Flush()); + for (size_t i = 0; i < kNumEntries; ++i) { + ASSERT_EQ("v" + ToString(i), Get("k" + ToString(i))); + } + std::string last_key = "k" + ToString(kNumEntries - 1); + std::string last_value = "v" + ToString(kNumEntries - 1); + env_->now_cpu_count_.store(0); + + // CPU timing is not enabled with kEnableTimeExceptForMutex + SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->Seek("k0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos); + ASSERT_EQ(0, env_->now_cpu_count_.load()); + delete iter; + + uint64_t kDummyAddonTime = uint64_t{1000000000000}; + + // Add time to NowNanos() reading. 
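+  // Same wall-clock injection trick as in TestPerfContextGetCpuTime above.
+  // For the now_cpu_count_ lower bound asserted later: each of the six
+  // iterator positioning calls (Seek, SeekForPrev, SeekToLast, SeekToFirst,
+  // Next, Prev) samples CPU time at least at its start and its end, giving
+  // 6 * 2 = 12 NowCPUNanos() calls.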
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonTime); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonTime); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonTime); + ASSERT_GE(env_->now_cpu_count_.load(), 12); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + + SetPerfLevel(PerfLevel::kDisable); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + delete iter; +} +#endif // OS_LINUX + +// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache +// breaks when that function is not implemented and no regular block cache is +// provided. +#if !defined(OS_SOLARIS) && !defined(OS_WIN) +TEST_F(DBTest2, PersistentCache) { + int num_iter = 80; + + Options options; + options.write_buffer_size = 64 * 1024; // small write buffer + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options = CurrentOptions(options); + + auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024}; + auto types = {/*compressed*/ 1, /*uncompressed*/ 0}; + for (auto bsize : bsizes) { + for (auto type : types) { + BlockBasedTableOptions table_options; + table_options.persistent_cache.reset( + new MockPersistentCache(type, 10 * 1024)); + table_options.no_block_cache = true; + table_options.block_cache = bsize ? 
                                     NewLRUCache(bsize) : nullptr;
+      table_options.block_cache_compressed = nullptr;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
+      // The default column family doesn't have a block cache.
+      Options no_block_cache_opts;
+      no_block_cache_opts.statistics = options.statistics;
+      no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+      BlockBasedTableOptions table_options_no_bc;
+      table_options_no_bc.no_block_cache = true;
+      no_block_cache_opts.table_factory.reset(
+          NewBlockBasedTableFactory(table_options_no_bc));
+      ReopenWithColumnFamilies(
+          {"default", "pikachu"},
+          std::vector<Options>({no_block_cache_opts, options}));
+
+      Random rnd(301);
+
+      // Write 80 values of 1KB each; a fresh random value is generated every
+      // fourth iteration and repeated in between, for a high compression
+      // ratio.
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+      std::vector<std::string> values;
+      std::string str;
+      for (int i = 0; i < num_iter; i++) {
+        if (i % 4 == 0) {  // high compression ratio
+          str = RandomString(&rnd, 1000);
+        }
+        values.push_back(str);
+        ASSERT_OK(Put(1, Key(i), values[i]));
+      }
+
+      // Flush all data from the memtable so that reads are from the block
+      // cache.
+      ASSERT_OK(Flush(1));
+
+      for (int i = 0; i < num_iter; i++) {
+        ASSERT_EQ(Get(1, Key(i)), values[i]);
+      }
+
+      auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
+      auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
+
+      ASSERT_GT(hit, 0);
+      ASSERT_GT(miss, 0);
+    }
+  }
+}
+#endif  // !defined(OS_SOLARIS) && !defined(OS_WIN)
+
+namespace {
+void CountSyncPoint() {
+  TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
+}
+}  // namespace
+
+TEST_F(DBTest2, SyncPointMarker) {
+  std::atomic<int> sync_point_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTest2::MarkedPoint",
+      [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
+
+  // The first dependency enforces that Marker is loaded before MarkedPoint.
+  // The second checks that thread 1's MarkedPoint should be disabled here.
+  // Execution order:
+  // |   Thread 1    |  Thread 2   |
+  // |               |   Marker    |
+  // |  MarkedPoint  |             |
+  // | Thread1First  |             |
+  // |               | MarkedPoint |
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+      {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
+      {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::function<void()> func1 = [&]() {
+    CountSyncPoint();
+    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
+  };
+
+  std::function<void()> func2 = [&]() {
+    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
+    CountSyncPoint();
+  };
+
+  auto thread1 = port::Thread(func1);
+  auto thread2 = port::Thread(func2);
+  thread1.join();
+  thread2.join();
+
+  // The callback is only executed once.
+  ASSERT_EQ(sync_point_called.load(), 1);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
+  std::string buffer;
+
+  PutVarint32(&buffer, static_cast<uint32_t>(0));
+  PutVarint32(&buffer, static_cast<uint32_t>(key_size));
+  PutVarint32(&buffer, static_cast<uint32_t>(value_size));
+
+  return buffer.size() + key_size + value_size;
+}
+
+TEST_F(DBTest2, ReadAmpBitmap) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions bbto;
+  uint32_t bytes_per_bit[2] = {1, 16};
+  for (size_t k = 0; k < 2; k++) {
+    // Disable delta encoding to make it easier to calculate read
+    // amplification.
+    bbto.use_delta_encoding = false;
+    // Huge block cache to make it easier to calculate read amplification.
+    bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
+    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    DestroyAndReopen(options);
+
+    const size_t kNumEntries = 10000;
+
+    Random rnd(301);
+    for (size_t i = 0; i < kNumEntries; i++) {
+      ASSERT_OK(Put(Key(static_cast<int>(i)), RandomString(&rnd, 100)));
+    }
+    ASSERT_OK(Flush());
+
+    Close();
+    Reopen(options);
+
+    // Read keys/values randomly and verify that the reported read amp error
+    // is less than 2%.
+    uint64_t total_useful_bytes = 0;
+    std::set<int> read_keys;
+    std::string value;
+    for (size_t i = 0; i < kNumEntries * 5; i++) {
+      int key_idx = rnd.Next() % kNumEntries;
+      std::string key = Key(key_idx);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(key_idx) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        total_useful_bytes +=
+            GetEncodedEntrySize(internal_key.size(), value.size());
+        read_keys.insert(key_idx);
+      }
+
+      double expected_read_amp =
+          static_cast<double>(total_useful_bytes) /
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      double read_amp =
+          static_cast<double>(options.statistics->getTickerCount(
+              READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      double error_pct = fabs(expected_read_amp - read_amp) * 100;
+      // The error between the reported read amp and the real read amp should
+      // be less than 2%.
+      EXPECT_LE(error_pct, 2);
+    }
+
+    // Make sure we read everything in the DB (which is smaller than our
+    // cache).
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
+    }
+    delete iter;
+
+    // Read amp is on average 100% since we read everything we loaded into
+    // memory.
+    if (k == 0) {
+      ASSERT_EQ(
+          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
+    } else {
+      ASSERT_NEAR(
+          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
+              1.0f /
+              options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
+          1, .01);
+    }
+  }
+}
+
+#ifndef OS_SOLARIS  // GetUniqueIdFromFile is not implemented
+TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
+  {
+    const int kIdBufLen = 100;
+    char id_buf[kIdBufLen];
+#ifndef OS_WIN
+    // You can't open a directory on Windows using a random access file.
+    std::unique_ptr<RandomAccessFile> file;
+    ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions()));
+    if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
+      // The fs holding the db directory doesn't support getting a unique file
+      // id. This means that running this test would fail because lru_cache
+      // would load the blocks again regardless of whether they are already in
+      // the cache.
+      return;
+    }
+#else
+    std::unique_ptr<Directory> dir;
+    ASSERT_OK(env_->NewDirectory(dbname_, &dir));
+    if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
+      // The fs holding the db directory doesn't support getting a unique file
+      // id. This means that running this test would fail because lru_cache
+      // would load the blocks again regardless of whether they are already in
+      // the cache.
+      return;
+    }
+#endif
+  }
+  uint32_t bytes_per_bit[2] = {1, 16};
+  for (size_t k = 0; k < 2; k++) {
+    std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
+    std::shared_ptr<Statistics> stats =
+        ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    Options options = CurrentOptions();
+    BlockBasedTableOptions bbto;
+    // Disable delta encoding to make it easier to calculate read
+    // amplification.
+    bbto.use_delta_encoding = false;
+    // Huge block cache to make it easier to calculate read amplification.
+    bbto.block_cache = lru_cache;
+    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.statistics = stats;
+    DestroyAndReopen(options);
+
+    const int kNumEntries = 10000;
+
+    Random rnd(301);
+    for (int i = 0; i < kNumEntries; i++) {
+      ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+    }
+    ASSERT_OK(Flush());
+
+    Close();
+    Reopen(options);
+
+    uint64_t total_useful_bytes = 0;
+    std::set<int> read_keys;
+    std::string value;
+    // Iter1: Read half the DB, read the even keys
+    // Key(0), Key(2), Key(4), Key(6), Key(8), ...
+    for (int i = 0; i < kNumEntries; i += 2) {
+      std::string key = Key(i);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(i) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        total_useful_bytes +=
+            GetEncodedEntrySize(internal_key.size(), value.size());
+        read_keys.insert(i);
+      }
+    }
+
+    size_t total_useful_bytes_iter1 =
+        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+    size_t total_loaded_bytes_iter1 =
+        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+    Close();
+    std::shared_ptr<Statistics> new_statistics =
+        ROCKSDB_NAMESPACE::CreateDBStatistics();
+    // Destroy the old statistics obj that the blocks in lru_cache are
+    // pointing to.
+    options.statistics.reset();
+    // Use the statistics object that we just created.
+    options.statistics = new_statistics;
+    Reopen(options);
+
+    // Iter2: Read half the DB, read the odd keys
+    // Key(1), Key(3), Key(5), Key(7), Key(9), ...
+    for (int i = 1; i < kNumEntries; i += 2) {
+      std::string key = Key(i);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(i) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        total_useful_bytes +=
+            GetEncodedEntrySize(internal_key.size(), value.size());
+        read_keys.insert(i);
+      }
+    }
+
+    size_t total_useful_bytes_iter2 =
+        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+    size_t total_loaded_bytes_iter2 =
+        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+    // Read amp is on average 100% since we read everything we loaded into
+    // memory.
+    if (k == 0) {
+      ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
+                total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
+    } else {
+      ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) *
+                      1.0f /
+                      (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
+                  1, .01);
+    }
+  }
+}
+#endif  // !OS_SOLARIS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.IncreaseParallelism(20);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  ASSERT_OK(Put(Key(5), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(10), "a"));
+  ASSERT_OK(Put(Key(15), "a"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  auto get_stat = [](std::string level_str, LevelStatType type,
+                     std::map<std::string, std::string> props) {
+    auto prop_str =
+        "compaction." + level_str + "." +
+        InternalStats::compaction_level_stats.at(type).property_name.c_str();
+    auto prop_item = props.find(prop_str);
+    return prop_item == props.end() ? 0 : std::stod(prop_item->second);
+  };
+
+  // The 2 files are trivially moved to L2.
+  ASSERT_EQ("0,0,2", FilesPerLevel());
+  // Also test that the stats GetMapProperty API reports the same result.
+  {
+    std::map<std::string, std::string> prop;
+    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+    ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
+  }
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L2; these 2 files will be moved to L2, overlap with
+  // the running compaction, and break the LSM consistency.
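+  // To make that happen, the callback below lowers
+  // level0_file_num_compaction_trigger to 2 and max_bytes_for_level_base to 1
+  // while the manual compaction is mid-run, so the two freshly flushed files
+  // immediately qualify for automatic compaction toward L2.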
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        ASSERT_OK(
+            dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+                                  {"max_bytes_for_level_base", "1"}}));
+        ASSERT_OK(Put(Key(6), "a"));
+        ASSERT_OK(Put(Key(7), "a"));
+        ASSERT_OK(Flush());
+
+        ASSERT_OK(Put(Key(8), "a"));
+        ASSERT_OK(Put(Key(9), "a"));
+        ASSERT_OK(Flush());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run a manual compaction that will compact the 2 files in L2
+  // into 1 file in L2.
+  cro.exclusive_manual_compaction = false;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Test that the stats GetMapProperty API reports 1 file in L2.
+  {
+    std::map<std::string, std::string> prop;
+    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+    ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
+  }
+}
+
+TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  options.IncreaseParallelism(20);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  ASSERT_OK(Put(Key(5), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(10), "a"));
+  ASSERT_OK(Put(Key(15), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // The 2 files are trivially moved to L1.
+  ASSERT_EQ("0,2", FilesPerLevel());
+
+  std::function<void()> bg_manual_compact = [&]() {
+    std::string k1 = Key(6);
+    std::string k2 = Key(9);
+    Slice k1s(k1);
+    Slice k2s(k2);
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
+  };
+  ROCKSDB_NAMESPACE::port::Thread bg_thread;
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L1; these 2 files will be moved to L1, overlap with
+  // the running compaction, and break the LSM consistency.
+  std::atomic<bool> flag(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        if (flag.exchange(true)) {
+          // We want to make sure to call this callback only once.
+          return;
+        }
+        ASSERT_OK(Put(Key(6), "a"));
+        ASSERT_OK(Put(Key(7), "a"));
+        ASSERT_OK(Flush());
+
+        ASSERT_OK(Put(Key(8), "a"));
+        ASSERT_OK(Put(Key(9), "a"));
+        ASSERT_OK(Flush());
+
+        // Start a non-exclusive manual compaction in a bg thread.
+        bg_thread = port::Thread(bg_manual_compact);
+        // This manual compaction conflicts with the other manual compaction,
+        // so it should wait until the first compaction finishes.
+        env_->SleepForMicroseconds(1000000);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run a manual compaction that will compact the 2 files in L1
+  // into 1 file in L1.
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = false;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  bg_thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction1) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+
+  // Generate another file containing the same keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+
+  int manual_compactions_paused = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
+        auto paused = reinterpret_cast<std::atomic<bool>*>(arg);
+        ASSERT_FALSE(paused->load(std::memory_order_acquire));
+        paused->store(true, std::memory_order_release);
+        manual_compactions_paused += 1;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<std::string> files_before_compact, files_after_compact;
+  // Remember the file names before the compaction is triggered.
+  std::vector<LiveFileMetaData> files_meta;
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_before_compact.push_back(file.name);
+  }
+
+  // OK, now trigger a manual compaction.
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  // Wait for compactions to get scheduled and stopped.
+  dbfull()->TEST_WaitForCompact(true);
+
+  // Get the file names after the compaction is stopped.
+  files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_after_compact.push_back(file.name);
+  }
+
+  // As if nothing happened.
+  ASSERT_EQ(files_before_compact, files_after_compact);
+  ASSERT_EQ(manual_compactions_paused, 1);
+
+  manual_compactions_paused = 0;
+  // Now make sure CompactFiles also does not run.
+  dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+                         files_before_compact, 0);
+  // Wait for the manual compaction to get scheduled and finish.
+  dbfull()->TEST_WaitForCompact(true);
+
+  files_meta.clear();
+  files_after_compact.clear();
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_after_compact.push_back(file.name);
+  }
+
+  ASSERT_EQ(files_before_compact, files_after_compact);
+  // CompactFiles returns at its entry point.
+  ASSERT_EQ(manual_compactions_paused, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// PausingManualCompaction does not affect auto compaction.
+TEST_F(DBTest2, PausingManualCompaction2) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.disable_auto_compactions = false;
+
+  DestroyAndReopen(options);
+  dbfull()->DisableManualCompaction();
+
+  Random rnd(301);
+  for (int i = 0; i < 2; i++) {
+    // Generate a file containing 100 keys.
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(j), RandomString(&rnd, 50)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  std::vector<LiveFileMetaData> files_meta;
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  ASSERT_EQ(files_meta.size(), 1);
+}
+
+TEST_F(DBTest2, PausingManualCompaction3) {
+  CompactRangeOptions compact_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50)));
+        }
+        Flush();
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1",
+      [&](void* /*arg*/) { run_manual_compactions++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->DisableManualCompaction();
+  dbfull()->CompactRange(compact_options, nullptr, nullptr);
+  dbfull()->TEST_WaitForCompact(true);
+  // As manual compaction is disabled, the sync point is never even reached.
+  ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1");
+  dbfull()->EnableManualCompaction();
+  dbfull()->CompactRange(compact_options, nullptr, nullptr);
+  dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction4) {
+  CompactRangeOptions compact_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50)));
+        }
+        Flush();
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
+        auto paused = reinterpret_cast<std::atomic<bool>*>(arg);
+        ASSERT_FALSE(paused->load(std::memory_order_acquire));
+        paused->store(true, std::memory_order_release);
+        run_manual_compactions++;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->EnableManualCompaction();
+  dbfull()->CompactRange(compact_options, nullptr, nullptr);
+  dbfull()->TEST_WaitForCompact(true);
+  ASSERT_EQ(run_manual_compactions, 1);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:2");
+  dbfull()->EnableManualCompaction();
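+  // With the pausing callback cleared and manual compaction re-enabled, the
+  // same CompactRange() can now run to completion and compact everything
+  // into the bottommost level, as the FilesPerLevel() check below verifies.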
+  dbfull()->CompactRange(compact_options, nullptr, nullptr);
+  dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, OptimizeForPointLookup) {
+  Options options = CurrentOptions();
+  Close();
+  options.OptimizeForPointLookup(2);
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_EQ("v1", Get("foo"));
+  Flush();
+  ASSERT_EQ("v1", Get("foo"));
+}
+
+TEST_F(DBTest2, OptimizeForSmallDB) {
+  Options options = CurrentOptions();
+  Close();
+  options.OptimizeForSmallDb();
+
+  // Find the cache object.
+  ASSERT_EQ(std::string(BlockBasedTableFactory::kName),
+            std::string(options.table_factory->Name()));
+  BlockBasedTableOptions* table_options =
+      reinterpret_cast<BlockBasedTableOptions*>(
+          options.table_factory->GetOptions());
+  ASSERT_TRUE(table_options != nullptr);
+  std::shared_ptr<Cache> cache = table_options->block_cache;
+
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  ASSERT_OK(Put("foo", "v1"));
+
+  // The memtable size is charged to the block cache.
+  ASSERT_NE(0, cache->GetUsage());
+
+  ASSERT_EQ("v1", Get("foo"));
+  Flush();
+
+  size_t prev_size = cache->GetUsage();
+  // Remember the block cache size, so that we can verify
+  // it is filled after Get().
+  // Use a pinnable slice so that it can pin the block, so that
+  // when we check the size it is not evicted.
+  PinnableSlice value;
+  ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
+  ASSERT_GT(cache->GetUsage(), prev_size);
+  value.Reset();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, GetRaceFlush1) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
+       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    Flush();
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+  });
+
+  // Get() is issued after the first Put(), so it should see either
+  // "v1" or "v2".
+  ASSERT_NE("NOT_FOUND", Get("foo"));
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush2) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
+       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    Flush();
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+  });
+
+  // Get() is issued after the first Put(), so it should see either
+  // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo")); + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, DirectIO) { + if (!IsDirectIOSupported()) { + return; + } + Options options = CurrentOptions(); + options.use_direct_reads = options.use_direct_io_for_flush_and_compaction = + true; + options.allow_mmap_reads = options.allow_mmap_writes = false; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "a")); + ASSERT_OK(Put(Key(5), "a")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(10), "a")); + ASSERT_OK(Put(Key(15), "a")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Reopen(options); +} + +TEST_F(DBTest2, MemtableOnlyIterator) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "first")); + ASSERT_OK(Put(1, "bar", "second")); + + ReadOptions ropt; + ropt.read_tier = kMemtableTier; + std::string value; + Iterator* it = nullptr; + + // Before flushing + // point lookups + ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); + ASSERT_EQ("first", value); + ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); + ASSERT_EQ("second", value); + + // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet. + it = db_->NewIterator(ropt, handles_[1]); + int count = 0; + for (it->SeekToFirst(); it->Valid(); it->Next()) { + ASSERT_TRUE(it->Valid()); + count++; + } + ASSERT_TRUE(!it->Valid()); + ASSERT_EQ(2, count); + delete it; + + Flush(1); + + // After flushing + // point lookups + ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); + ASSERT_EQ("first", value); + ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); + ASSERT_EQ("second", value); + // nothing should be returned using memtable-only iterator after flushing. 
+  it = db_->NewIterator(ropt, handles_[1]);
+  count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ASSERT_TRUE(it->Valid());
+    count++;
+  }
+  ASSERT_TRUE(!it->Valid());
+  ASSERT_EQ(0, count);
+  delete it;
+
+  // Add a key to the memtable.
+  ASSERT_OK(Put(1, "foobar", "third"));
+  it = db_->NewIterator(ropt, handles_[1]);
+  count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ASSERT_TRUE(it->Valid());
+    ASSERT_EQ("foobar", it->key().ToString());
+    ASSERT_EQ("third", it->value().ToString());
+    count++;
+  }
+  ASSERT_TRUE(!it->Valid());
+  ASSERT_EQ(1, count);
+  delete it;
+}
+
+TEST_F(DBTest2, LowPriWrite) {
+  Options options = CurrentOptions();
+  // Compaction pressure should trigger, since we write 6 files.
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 12;
+  options.level0_stop_writes_trigger = 30;
+  options.delayed_write_rate = 8 * 1024 * 1024;
+  Reopen(options);
+
+  std::atomic<int> rate_limit_count(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:1", [&](void* arg) {
+        rate_limit_count.fetch_add(1);
+        int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
+        ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
+      });
+  // Block compaction.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  WriteOptions wo;
+  for (int i = 0; i < 6; i++) {
+    wo.low_pri = false;
+    Put("", "", wo);
+    wo.low_pri = true;
+    Put("", "", wo);
+    Flush();
+  }
+  ASSERT_EQ(0, rate_limit_count.load());
+  wo.low_pri = true;
+  Put("", "", wo);
+  ASSERT_EQ(1, rate_limit_count.load());
+  wo.low_pri = false;
+  Put("", "", wo);
+  ASSERT_EQ(1, rate_limit_count.load());
+
+  TEST_SYNC_POINT("DBTest.LowPriWrite:0");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  dbfull()->TEST_WaitForCompact();
+  wo.low_pri = true;
+  Put("", "", wo);
+  ASSERT_EQ(1, rate_limit_count.load());
+  wo.low_pri = false;
+  Put("", "", wo);
+  ASSERT_EQ(1, rate_limit_count.load());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RateLimitedCompactionReads) {
+  // The compaction input has 512KB of data.
+  const int kNumKeysPerFile = 128;
+  const int kBytesPerKey = 1024;
+  const int kNumL0Files = 4;
+
+  for (auto use_direct_io : {false, true}) {
+    if (use_direct_io && !IsDirectIOSupported()) {
+      continue;
+    }
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    options.level0_file_num_compaction_trigger = kNumL0Files;
+    options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile));
+    options.new_table_reader_for_compaction_inputs = true;
+    // Reading the compaction input takes roughly one second, split into
+    // 100 x 10ms intervals. Each interval permits 5.12KB, which is smaller
+    // than the block size, so this test exercises the code for chunking
+    // reads.
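+    // Worked numbers behind the comment above (an illustrative sanity check
+    // using this test's constants): 4 files * 128 keys * 1KB = 512KB/s, and
+    // a 10ms refill period yields 512KB / 100 = 5.12KB per interval, below
+    // the 16KB block size configured below.
+    static_assert(kNumL0Files * kNumKeysPerFile * kBytesPerKey == 512 * 1024,
+                  "compaction input should total 512KB");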
+    options.rate_limiter.reset(NewGenericRateLimiter(
+        static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
+                             kBytesPerKey) /* rate_bytes_per_sec */,
+        10 * 1000 /* refill_period_us */, 10 /* fairness */,
+        RateLimiter::Mode::kReadsOnly));
+    options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+        use_direct_io;
+    BlockBasedTableOptions bbto;
+    bbto.block_size = 16384;
+    bbto.no_block_cache = true;
+    options.table_factory.reset(new BlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    for (int i = 0; i < kNumL0Files; ++i) {
+      for (int j = 0; j <= kNumKeysPerFile; ++j) {
+        ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
+      }
+      dbfull()->TEST_WaitForFlushMemTable();
+      ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+    }
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+    ASSERT_EQ(0, options.rate_limiter->GetTotalBytesThrough(Env::IO_HIGH));
+    // Should be slightly above 512KB due to non-data blocks read. Arbitrarily
+    // chose 1MB as the upper bound on the total bytes read.
+    size_t rate_limited_bytes =
+        options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW);
+    // Include the explicit prefetch of the footer in the direct I/O case.
+    size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
+    ASSERT_GE(
+        rate_limited_bytes,
+        static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
+    ASSERT_LT(
+        rate_limited_bytes,
+        static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
+                            direct_io_extra));
+
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
+    }
+    delete iter;
+    // Bytes read for the user iterator shouldn't count against the rate
+    // limit.
+    ASSERT_EQ(rate_limited_bytes,
+              static_cast<size_t>(
+                  options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW)));
+  }
+}
+#endif  // ROCKSDB_LITE
+
+// Make sure the DB can be reopened with a reduced number of levels, given no
+// file is on a level higher than the new num_levels.
+TEST_F(DBTest2, ReduceLevel) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  Reopen(options);
+  Put("foo", "bar");
+  Flush();
+  MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 1;
+  dbfull()->CompactRange(compact_options, nullptr, nullptr);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  options.num_levels = 3;
+  Reopen(options);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+}
+
+// Test that ReadCallback is actually used in both memtables and SST tables.
+TEST_F(DBTest2, ReadCallbackTest) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  Reopen(options);
+  std::vector<const Snapshot*> snapshots;
+  // Try to create a db with multiple layers and a memtable.
+  const std::string key = "foo";
+  const std::string value = "bar";
+  // This test assumes that the seq starts with 1 and is increased by 1 after
+  // each write batch of size 1. If that behavior changes, the test needs to
+  // be updated as well.
+  // TODO(myabandeh): update this test to use the seq number that is returned
+  // by the DB instead of assuming what seq the DB used.
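+  // Concretely, under that assumption: after the fifth Put below the DB is at
+  // seq 5 and the newest value for "foo" is "bar5", so a callback admitting
+  // only seq <= 5 must make Get() return exactly "bar5". The loop at the end
+  // of this test verifies this for every seq.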
+ int i = 1; + for (; i < 10; i++) { + Put(key, value + std::to_string(i)); + // Take a snapshot to avoid the value being removed during compaction + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + Flush(); + for (; i < 20; i++) { + Put(key, value + std::to_string(i)); + // Take a snapshot to avoid the value being removed during compaction + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + Flush(); + MoveFilesToLevel(6); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + for (; i < 30; i++) { + Put(key, value + std::to_string(i)); + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + Flush(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + // And also add some values to the memtable + for (; i < 40; i++) { + Put(key, value + std::to_string(i)); + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + + class TestReadCallback : public ReadCallback { + public: + explicit TestReadCallback(SequenceNumber snapshot) + : ReadCallback(snapshot), snapshot_(snapshot) {} + bool IsVisibleFullCheck(SequenceNumber seq) override { + return seq <= snapshot_; + } + + private: + SequenceNumber snapshot_; + }; + + for (int seq = 1; seq < i; seq++) { + PinnableSlice pinnable_val; + ReadOptions roptions; + TestReadCallback callback(seq); + bool dont_care = true; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = dbfull()->DefaultColumnFamily(); + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = &dont_care; + get_impl_options.callback = &callback; + Status s = dbfull()->GetImpl(roptions, key, get_impl_options); + ASSERT_TRUE(s.ok()); + // Assuming that after each Put the DB increased seq by one, the value and + // seq number must be equal since we also inc value by 1 after each Put. + ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString()); + } + + for (auto snapshot : snapshots) { + dbfull()->ReleaseSnapshot(snapshot); + } +} + +#ifndef ROCKSDB_LITE + +TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { + // Regression test for race condition where an obsolete file is returned to + // user as a "live file" but then deleted, all while file deletions are + // disabled. + // + // It happened like this: + // + // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles + // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the + // latter returned "x.log" + // 3. [flush thread] PurgeObsoleteFiles deleted "x.log" + // 4. [user thread] Reading "x.log" failed + // + // Unfortunately the only regression test I can come up with involves sleep. + // We cannot set SyncPoints to repro since, once the fix is applied, the + // SyncPoints would cause a deadlock as the repro's sequence of events is now + // prohibited. + // + // Instead, if we sleep for a second between Find and Purge, and ensure the + // read attempt happens after purge, then the sequence of events will almost + // certainly happen on the old code. 
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BackgroundCallFlush:FilesFound",
+       "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
+      {"DBImpl::PurgeObsoleteFiles:End",
+       "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::PurgeObsoleteFiles:Begin",
+      [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Put("key", "val");
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  db_->Flush(flush_opts);
+  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
+
+  db_->DisableFileDeletions();
+  VectorLogPtr log_files;
+  db_->GetSortedWalFiles(log_files);
+  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
+  for (const auto& log_file : log_files) {
+    ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
+  }
+
+  db_->EnableFileDeletions();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestNumPread) {
+  Options options = CurrentOptions();
+  // Disable the block cache.
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  env_->count_random_reads_ = true;
+
+  env_->random_file_open_counter_.store(0);
+  ASSERT_OK(Put("bar", "foo"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  // After flush, we'll open the file and read the footer, meta block,
+  // property block and index block.
+  ASSERT_EQ(4, env_->random_read_counter_.Read());
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read.
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_EQ(1, env_->random_read_counter_.Read());
+  // All files are already opened.
+  ASSERT_EQ(0, env_->random_file_open_counter_.load());
+
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_OK(Put("bar2", "foo2"));
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Flush());
+  // After flush, we'll open the file and read the footer, meta block,
+  // property block and index block.
+  ASSERT_EQ(4, env_->random_read_counter_.Read());
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // Compaction needs two input blocks, which requires 2 preads, and
+  // generates a new SST file which needs 4 preads (footer, meta block,
+  // property block and index block). In total 6.
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(6, env_->random_read_counter_.Read());
+  // All compaction input files should have already been opened.
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read.
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_EQ("foo2", Get("bar2"));
+  ASSERT_EQ(1, env_->random_read_counter_.Read());
+  // SST files are already opened.
+  ASSERT_EQ(0, env_->random_file_open_counter_.load());
+}
+
+TEST_F(DBTest2, TraceAndReplay) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");
+  single_iter->SeekForPrev("g");
+  delete single_iter;
+
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as they are after EndTrace.
+  Put("hello", "world");
+  Merge("foo", "bar");
+
+  // Open another db, replay, and verify the data.
+  std::string value;
+  std::string dbname2 = test::TmpDir(env_) + "/db_replay";
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist.
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  Replayer replayer(db2, handles_, std::move(trace_reader));
+  ASSERT_OK(replayer.Replay());
+
+  ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ("1", value);
+  ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+  ASSERT_EQ("12", value);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+  ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+  ASSERT_EQ("bar", value);
+  ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+  ASSERT_EQ("rocks", value);
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithLimit) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+
+  // Test the max trace file size options.
+  trace_opts.max_trace_file_size = 5;
+  std::string trace_filename = dbname_ + "/rocksdb.trace1";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Put(0, "b", "1"));
+  ASSERT_OK(Put(0, "c", "1"));
+  ASSERT_OK(db_->EndTrace());
+
+  std::string dbname2 = test::TmpDir(env_) + "/db_replay2";
+  std::string value;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist.
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  Replayer replayer(db2, handles_, std::move(trace_reader));
+  ASSERT_OK(replayer.Replay());
+
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithSampling) {
+  Options options = CurrentOptions();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+
+  // Test the trace file sampling options.
+  trace_opts.sampling_frequency = 2;
+  std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Put(0, "b", "2"));
+  ASSERT_OK(Put(0, "c", "3"));
+  ASSERT_OK(Put(0, "d", "4"));
+  ASSERT_OK(Put(0, "e", "5"));
+  ASSERT_OK(db_->EndTrace());
+
+  std::string dbname2 = test::TmpDir(env_) + "/db_replay_sampling";
+  std::string value;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  Replayer replayer(db2, handles_, std::move(trace_reader));
+  ASSERT_OK(replayer.Replay());
+
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+  ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithFilter) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  trace_opts.filter = TraceFilterType::kTraceFilterWrite;
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");
+  single_iter->SeekForPrev("g");
+  delete single_iter;
+
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as they are after EndTrace.
+  Put("hello", "world");
+  Merge("foo", "bar");
+
+  // Open another db, replay, and verify the data.
+  std::string value;
+  std::string dbname2 = test::TmpDir(env_) + "/db_replay";
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist.
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  Replayer replayer(db2, handles_, std::move(trace_reader));
+  ASSERT_OK(replayer.Replay());
+
+  // None of the key-values should be present, since we filtered out the
+  // WRITE ops.
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Set up a new db.
+  std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read";
+  ASSERT_OK(DestroyDB(dbname3, options));
+
+  DB* db3_init = nullptr;
+  options.create_if_missing = true;
+  ColumnFamilyHandle* cf3;
+  ASSERT_OK(DB::Open(options, dbname3, &db3_init));
+  ASSERT_OK(
+      db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
+  delete cf3;
+  delete db3_init;
+
+  column_families.clear();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  handles.clear();
+
+  DB* db3 = nullptr;
+  ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist.
+  ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  // The tracer will not record the READ ops.
+  trace_opts.filter = TraceFilterType::kTraceFilterGet;
+  std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
+  std::unique_ptr<TraceWriter> trace_writer3;
+  ASSERT_OK(
+      NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
+  ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
+
+  ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
+  ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
+  ASSERT_OK(db3->Delete(wo, handles[0], "c"));
+  ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
+
+  ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ(value, "1");
+  ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  ASSERT_OK(db3->EndTrace());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db3;
+  ASSERT_OK(DestroyDB(dbname3, options));
+
+  std::unique_ptr<TraceReader> trace_reader3;
+  ASSERT_OK(
+      NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
+
+  // Count the number of records in the trace file.
+  int count = 0;
+  std::string data;
+  Status s;
+  while (true) {
+    s = trace_reader3->Read(&data);
+    if (!s.ok()) {
+      break;
+    }
+    count += 1;
+  }
+  // We also need to count the header and footer:
+  // 4 WRITE + HEADER + FOOTER = 6
+  ASSERT_EQ(count, 6);
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+  Options options = CurrentOptions();
+  options.allow_mmap_reads = true;
+  options.max_open_files = 100;
+  options.compression = kNoCompression;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  PinnableSlice pinned_value;
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  // It is not safe to pin mmap files as they might disappear by compaction
+  ASSERT_FALSE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+  dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */,
+                              nullptr /* end */, nullptr /* column_family */,
+                              true /* disallow_trivial_move */);
+
+  // Ensure pinned_value doesn't rely on memory munmap'd by the above
+  // compaction. It crashes if it does.
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+#ifndef ROCKSDB_LITE
+  pinned_value.Reset();
+  // Unsafe to pin mmap files when they could be kicked out of table cache
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  ASSERT_FALSE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+  pinned_value.Reset();
+  // In read-only mode with infinite capacity on table cache it should pin the
+  // value and avoid the memcpy
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  ASSERT_TRUE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+#endif
+}
+
+TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = false;
+  bbto.cache_index_and_filter_blocks = false;
+  bbto.block_cache = NewLRUCache(100000);
+  bbto.block_size = 400;  // small block size
+  options.table_factory.reset(new BlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  std::string v = RandomString(&rnd, 400);
+
+  // Since v is the size of a block, each key should take a block
+  // of 400+ bytes.
+ Put("1", v); + Put("3", v); + Put("5", v); + Put("7", v); + ASSERT_OK(Flush()); + + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Verify that iterators don't pin more than one data block in block cache + // at each time. + { + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->SeekToFirst(); + + for (int i = 0; i < 4; i++) { + ASSERT_TRUE(iter->Valid()); + // Block cache should contain exactly one block. + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("4"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + } + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Test compaction case + Put("2", v); + Put("5", v); + Put("6", v); + Put("8", v); + ASSERT_OK(Flush()); + + // Clear existing data in block cache + bbto.block_cache->SetCapacity(0); + bbto.block_cache->SetCapacity(100000); + + // Verify compaction input iterators don't hold more than one data blocks at + // one time. + std::atomic finished(false); + std::atomic block_newed(0); + std::atomic block_destroyed(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Block::Block:0", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load()); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 1); + block_newed.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Block::~Block", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load() + 1); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 2); + block_destroyed.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run:BeforeVerify", + [&](void* /*arg*/) { finished = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Two input files. Each of them has 4 data blocks. + ASSERT_EQ(8, block_newed.load()); + ASSERT_EQ(8, block_destroyed.load()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestBBTTailPrefetch) { + std::atomic called(false); + size_t expected_lower_bound = 512 * 1024; + size_t expected_higher_bound = 512 * 1024; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast(arg); + EXPECT_LE(expected_lower_bound, *prefetch_size); + EXPECT_GE(expected_higher_bound, *prefetch_size); + called = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + expected_lower_bound = 0; + expected_higher_bound = 8 * 1024; + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + // Full compaction to make sure there is no L0 file after the open. 
+  // Full compaction to make sure there is no L0 file after the open.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_TRUE(called.load());
+  called = false;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  std::atomic<bool> first_call(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+        size_t* prefetch_size = static_cast<size_t*>(arg);
+        if (first_call) {
+          EXPECT_EQ(4 * 1024, *prefetch_size);
+          first_call = false;
+        } else {
+          EXPECT_GE(4 * 1024, *prefetch_size);
+        }
+        called = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.max_file_opening_threads = 1;  // one thread
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.max_open_files = -1;
+  Reopen(options);
+
+  Put("1", "1");
+  Put("9", "1");
+  Flush();
+
+  Put("1", "1");
+  Put("9", "1");
+  Flush();
+
+  ASSERT_TRUE(called.load());
+  called = false;
+
+  // Parallel loading SST files
+  options.max_file_opening_threads = 16;
+  Reopen(options);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_TRUE(called.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
+  // Setup sync point dependency to reproduce the race condition of
+  // DBImpl::GetColumnFamilyHandleUnlocked
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
+       "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
+      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
+       "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"test1", "test2"}, Options());
+  ASSERT_EQ(handles_.size(), 2);
+
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  port::Thread user_thread1([&]() {
+    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
+    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+    TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
+    TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
+    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+  });
+
+  port::Thread user_thread2([&]() {
+    TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
+    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
+    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+    TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
+    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+  });
+
+  user_thread1.join();
+  user_thread2.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, TestCompactFiles) {
+  // Setup sync point dependency to reproduce the race condition of
+  // DBImpl::GetColumnFamilyHandleUnlocked
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"TestCompactFiles::IngestExternalFile1",
+       "TestCompactFiles::IngestExternalFile2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.num_levels = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  auto* handle = db_->DefaultColumnFamily();
+  ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+  ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+      ROCKSDB_NAMESPACE::EnvOptions(), options};
+  std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
+  std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
+  std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
+
+  ASSERT_OK(sst_file_writer.Open(external_file1));
+  ASSERT_OK(sst_file_writer.Put("1", "1"));
+  ASSERT_OK(sst_file_writer.Put("2", "2"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(sst_file_writer.Open(external_file2));
+  ASSERT_OK(sst_file_writer.Put("3", "3"));
+  ASSERT_OK(sst_file_writer.Put("4", "4"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(sst_file_writer.Open(external_file3));
+  ASSERT_OK(sst_file_writer.Put("5", "5"));
+  ASSERT_OK(sst_file_writer.Put("6", "6"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
+                                    IngestExternalFileOptions()));
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+  std::vector<std::string> files;
+  GetSstFiles(env_, dbname_, &files);
+  ASSERT_EQ(files.size(), 2);
+
+  port::Thread user_thread1(
+      [&]() { db_->CompactFiles(CompactionOptions(), handle, files, 1); });
+
+  port::Thread user_thread2([&]() {
+    ASSERT_OK(db_->IngestExternalFile(handle, {external_file2},
+                                      IngestExternalFileOptions()));
+    TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
+  });
+
+  user_thread1.join();
+  user_thread2.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // ROCKSDB_LITE
+
+// TODO: figure out why this test fails in appveyor
+#ifndef OS_WIN
+TEST_F(DBTest2, MultiDBParallelOpenTest) {
+  const int kNumDbs = 2;
+  Options options = CurrentOptions();
+  std::vector<std::string> dbnames;
+  for (int i = 0; i < kNumDbs; ++i) {
+    dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i));
+    ASSERT_OK(DestroyDB(dbnames.back(), options));
+  }
+
+  // Verify empty DBs can be created in parallel
+  std::vector<port::Thread> open_threads;
+  std::vector<DB*> dbs{static_cast<unsigned>(kNumDbs), nullptr};
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads.emplace_back(
+        [&](int dbnum) {
+          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+        },
+        i);
+  }
+
+  // Now add some data and close, so next we can verify non-empty DBs can be
+  // recovered in parallel
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads[i].join();
+    ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
+    delete dbs[i];
+  }
+
+  // Verify non-empty DBs can be recovered in parallel
+  dbs.clear();
+  open_threads.clear();
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads.emplace_back(
+        [&](int dbnum) {
+          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+        },
+        i);
+  }
+
+  // Wait and cleanup
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads[i].join();
+    delete dbs[i];
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+  }
+}
+#endif  // OS_WIN
+
+namespace {
+class DummyOldStats : public Statistics {
+ public:
+  uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
+  void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
+    num_rt++;
+  }
+  void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
+  uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
+    return 0;
+  }
+  void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
+    num_mt++;
+  }
+  void histogramData(
+      uint32_t /*histogram_type*/,
+      ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
+  std::string getHistogramString(uint32_t /*type*/) const override {
+    return "";
+  }
+  bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
+  std::string ToString() const override { return ""; }
+  int num_rt = 0;
+  int num_mt = 0;
+};
+}  // namespace
+
+TEST_F(DBTest2, OldStatsInterface) {
+  DummyOldStats* dos = new DummyOldStats();
+  std::shared_ptr<Statistics> stats(dos);
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = stats;
+  Reopen(options);
+
+  Put("foo", "bar");
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("foo"));
+
+  ASSERT_GT(dos->num_rt, 0);
+  ASSERT_GT(dos->num_mt, 0);
+}
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+  const Snapshot* ss = db_->GetSnapshot();
+
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+
+  ASSERT_NOK(db_->Close());
+  db_->ReleaseSnapshot(ss);
+  ASSERT_OK(db_->Close());
+  delete db_;
+  db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(Put("bbb1", ""));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Seeking into f1, the iterator checks the bloom filter, which reports the
+  // file iterator as invalidated, so the cursor is placed in f2, with the
+  // next key being "ddd0".
+  iter->Seek("bbb1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bbb1", iter->key().ToString());
+
+  // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
+  iter->Seek("ccc1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("ccc1", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // The seek key is filtered out by f1's prefix bloom filter. Ending up
+  // invalid is just one of several positions valid under the contract;
+  // positioning to ccc1 or ddd0 would also be valid.
+  // This is just to validate the behavior of the current implementation;
+  // if the underlying implementation changes, the test might fail here.
+  iter->Seek("bbb1");
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8 * 8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
+
+// When DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and it is written again
+// after each flush. If DB crashes between the flushes, the CF that was
+// already flushed will point past the latest log file, and we then
+// require that file not to be corrupted, triggering a corruption report.
+// We need to fix the bug and enable the test.
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+  const std::vector<std::string> sync_points = {
+      "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+  for (const auto& test_sync_point : sync_points) {
+    Options options = CurrentOptions();
+    // First destroy original db to ensure a clean start.
+    DestroyAndReopen(options);
+    options.create_if_missing = true;
+    options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put(1, "foo", "bar"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put(1, "foo", "bar"));
+    // The value is large enough to be divided to two blocks.
+    std::string large_value(400, ' ');
+    ASSERT_OK(Put("foo1", large_value));
+    ASSERT_OK(Put("foo2", large_value));
+    Close();
+
+    // Corrupt the log file in the middle, so that it is not corrupted
+    // in the tail.
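+    // (With kPointInTimeRecovery the WAL is replayed only up to the first
+    // corrupted record; the bytes flipped below land inside the large values
+    // written above, so the failure happens mid-file while the tail of the
+    // log stays readable.)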
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    for (const auto& f : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) {
+        std::string fname = dbname_ + "/" + f;
+        std::string file_content;
+        ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+        file_content[400] = 'h';
+        file_content[401] = 'a';
+        ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+        break;
+      }
+    }
+
+    // Reopen and freeze the file system after the first manifest write.
+    FaultInjectionTestEnv fit_env(options.env);
+    options.env = &fit_env;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        test_sync_point,
+        [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_NOK(TryReopenWithColumnFamilies(
+        {kDefaultColumnFamilyName, "pikachu"}, options));
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    fit_env.SetFilesystemActive(true);
+    // If we continue using the failure-injection Env, it will complain about
+    // something when renaming the current file, which is not expected. Need
+    // to investigate why.
+    options.env = env_;
+    ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+                                          options));
+  }
+}
+
+TEST_F(DBTest2, SeekFileRangeDeleteTail) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewCappedPrefixTransform(1));
+  options.num_levels = 3;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "a"));
+  const Snapshot* s1 = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+  ASSERT_OK(Put("b", "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("x", "a"));
+  ASSERT_OK(Put("z", "a"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  {
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+    iter->Seek("e");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("x", iter->key().ToString());
+  }
+  db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(DBTest2, BackgroundPurgeTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_manager =
+      std::make_shared<WriteBufferManager>(1 << 20);
+  options.avoid_unnecessary_blocking_io = true;
+  DestroyAndReopen(options);
+  size_t base_value = options.write_buffer_manager->memory_usage();
+
+  ASSERT_OK(Put("a", "a"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(Flush());
+  size_t value = options.write_buffer_manager->memory_usage();
+  ASSERT_GT(value, base_value);
+
+  db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
+  test::SleepingBackgroundTask sleeping_task_after;
+  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                          &sleeping_task_after, Env::Priority::HIGH);
+  delete iter;
+
+  Env::Default()->SleepForMicroseconds(100000);
+  value = options.write_buffer_manager->memory_usage();
+  ASSERT_GT(value, base_value);
+
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+
+  test::SleepingBackgroundTask sleeping_task_after2;
+  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                          &sleeping_task_after2, Env::Priority::HIGH);
+  sleeping_task_after2.WakeUp();
+  sleeping_task_after2.WaitUntilDone();
+
+  value = options.write_buffer_manager->memory_usage();
+  ASSERT_EQ(base_value, value);
+}
+
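+// With max_manifest_file_size set to 10 bytes, every flush rolls the manifest
+// file, so the default-CF flush below races memtable switching against the
+// installation of a new manifest while CF "pikachu" keeps producing L0 files.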
+TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  options.max_manifest_file_size = 10;
+  options.create_if_missing = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+
+  ASSERT_OK(Put("foo", "value"));
+  const int kL0Files = options.level0_file_num_compaction_trigger;
+  for (int i = 0; i < kL0Files; ++i) {
+    ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
+    ASSERT_OK(Flush(/*cf=*/1));
+  }
+
+  port::Thread thread([&]() { ASSERT_OK(Flush()); });
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  thread.join();
+}
+
+TEST_F(DBTest2, SameSmallestInSameLevel) {
+  // This test validates fractional cascading logic when several files at one
+  // level contain only the same user key.
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("key", "1"));
+  ASSERT_OK(Put("key", "2"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
+  Flush();
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
+                                   nullptr));
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
+  Flush();
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
+  Flush();
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
+  Flush();
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
+  Flush();
+  dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,4,1", FilesPerLevel());
+#endif  // ROCKSDB_LITE
+
+  ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.block_size = 300;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  table_options.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  Reopen(options);
+
+  Random rnd(301);
+  std::string large_value = RandomString(&rnd, 500);
+
+  ASSERT_OK(Put("a1", large_value));
+  ASSERT_OK(Put("x1", large_value));
+  ASSERT_OK(Put("y1", large_value));
+  Flush();
+
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+    iterator->SeekForPrev("x3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+
+    iterator->SeekForPrev("a3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("a1", iterator->key().ToString());
+
+    iterator->SeekForPrev("y3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+
+    // Query more than one non-existing prefix to cover the case both
+    // of empty hash bucket and hash bucket conflict.
+    iterator->SeekForPrev("b1");
+    // Result should be not valid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("c1");
+    // Result should be not valid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("d1");
+    // Result should be not valid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("y3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+  }
+}
+
+TEST_F(DBTest2, ChangePrefixExtractor) {
+  for (bool use_partitioned_filter : {true, false}) {
+    // create a DB with block prefix index
+    BlockBasedTableOptions table_options;
+    Options options = CurrentOptions();
+
+    // Sometimes filter is checked based on upper bound. Assert counters
+    // for that case. Otherwise, only check data correctness.
+#ifndef ROCKSDB_LITE
+    bool expect_filter_check = !use_partitioned_filter;
+#else
+    bool expect_filter_check = false;
+#endif
+    table_options.partition_filters = use_partitioned_filter;
+    if (use_partitioned_filter) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.statistics = CreateDBStatistics();
+
+    options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+
+    ASSERT_OK(Put("aa", ""));
+    ASSERT_OK(Put("xb", ""));
+    ASSERT_OK(Put("xx1", ""));
+    ASSERT_OK(Put("xz1", ""));
+    ASSERT_OK(Put("zz", ""));
+    Flush();
+
+    // After reopening the DB with prefix size 2 => 1, the prefix extractor
+    // won't take effect unless it cannot change the results based on the
+    // upper bound and the seek key.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    Reopen(options);
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+      iterator->Seek("xa");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not
+      // correct in this case. So don't check counters in this case.
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xz");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xz1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+    }
+
+    std::string ub_str = "xg9";
+    Slice ub(ub_str);
+    ReadOptions ro;
+    ro.iterate_upper_bound = &ub;
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      // SeekForPrev() never uses prefix bloom if it is changed.
+ iterator->SeekForPrev("xg0"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xb", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + } + + ub_str = "xx9"; + ub = Slice(ub_str); + { + std::unique_ptr iterator(db_->NewIterator(ro)); + + iterator->Seek("x"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xb", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + iterator->Seek("xx0"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xx1", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + } + + CompactRangeOptions compact_range_opts; + compact_range_opts.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr)); + ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr)); + + // Re-execute similar queries after a full compaction + { + std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + + iterator->Seek("x"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xb", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + iterator->Seek("xg"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xx1", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + iterator->Seek("xz"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xz1", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + } + { + std::unique_ptr iterator(db_->NewIterator(ro)); + + iterator->SeekForPrev("xx0"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xb", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + iterator->Seek("xx0"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xx1", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + } + + ub_str = "xg9"; + ub = Slice(ub_str); + { + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->SeekForPrev("xg0"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("xb", iterator->key().ToString()); + if (expect_filter_check) { + ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + } + } +} + +TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + table_options.block_size = 300; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.level0_file_num_compaction_trigger = 8; + + Reopen(options); + + ASSERT_OK(Put("b1", "ok")); + Flush(); + + // Flushing several files so that the chance that hash bucket + // is empty fo "b" in at least one of the files is high. 
+ ASSERT_OK(Put("a1", "")); + ASSERT_OK(Put("c1", "")); + Flush(); + + ASSERT_OK(Put("a2", "")); + ASSERT_OK(Put("c2", "")); + Flush(); + + ASSERT_OK(Put("a3", "")); + ASSERT_OK(Put("c3", "")); + Flush(); + + ASSERT_OK(Put("a4", "")); + ASSERT_OK(Put("c4", "")); + Flush(); + + ASSERT_OK(Put("a5", "")); + ASSERT_OK(Put("c5", "")); + Flush(); + + ASSERT_EQ("ok", Get("b1")); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, AutoPrefixMode1) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = CreateDBStatistics(); + + Reopen(options); + + Random rnd(301); + std::string large_value = RandomString(&rnd, 500); + + ASSERT_OK(Put("a1", large_value)); + ASSERT_OK(Put("x1", large_value)); + ASSERT_OK(Put("y1", large_value)); + Flush(); + + ReadOptions ro; + ro.total_order_seek = false; + ro.auto_prefix_mode = true; + { + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->Seek("b1"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("x1", iterator->key().ToString()); + ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + std::string ub_str = "b9"; + Slice ub(ub_str); + ro.iterate_upper_bound = &ub; + + { + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->Seek("b1"); + ASSERT_FALSE(iterator->Valid()); + ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + ub_str = "z"; + ub = Slice(ub_str); + { + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->Seek("b1"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("x1", iterator->key().ToString()); + ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + ub_str = "c"; + ub = Slice(ub_str); + { + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->Seek("b1"); + ASSERT_FALSE(iterator->Valid()); + ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + } + + // The same queries without recreating iterator + { + ub_str = "b9"; + ub = Slice(ub_str); + ro.iterate_upper_bound = &ub; + + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->Seek("b1"); + ASSERT_FALSE(iterator->Valid()); + ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + + ub_str = "z"; + ub = Slice(ub_str); + + iterator->Seek("b1"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("x1", iterator->key().ToString()); + ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + + ub_str = "c"; + ub = Slice(ub_str); + + iterator->Seek("b1"); + ASSERT_FALSE(iterator->Valid()); + ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + + ub_str = "b9"; + ub = Slice(ub_str); + ro.iterate_upper_bound = &ub; + iterator->SeekForPrev("b1"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("a1", iterator->key().ToString()); + ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + + ub_str = "zz"; + ub = Slice(ub_str); + ro.iterate_upper_bound = &ub; + iterator->SeekToLast(); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("y1", iterator->key().ToString()); + + iterator->SeekToFirst(); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("a1", iterator->key().ToString()); + } +} +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void 
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif  // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test_util.cc b/src/rocksdb/db/db_test_util.cc
new file mode 100644
index 000000000..c73abde41
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.cc
@@ -0,0 +1,1564 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base)
+    : EnvWrapper(base),
+      rnd_(301),
+      sleep_counter_(this),
+      addon_time_(0),
+      time_elapse_only_sleep_(false),
+      no_slowdown_(false) {
+  delay_sstable_sync_.store(false, std::memory_order_release);
+  drop_writes_.store(false, std::memory_order_release);
+  no_space_.store(false, std::memory_order_release);
+  non_writable_.store(false, std::memory_order_release);
+  count_random_reads_ = false;
+  count_sequential_reads_ = false;
+  manifest_sync_error_.store(false, std::memory_order_release);
+  manifest_write_error_.store(false, std::memory_order_release);
+  log_write_error_.store(false, std::memory_order_release);
+  random_file_open_counter_.store(0, std::memory_order_relaxed);
+  delete_count_.store(0, std::memory_order_relaxed);
+  num_open_wal_file_.store(0);
+  log_write_slowdown_ = 0;
+  bytes_written_ = 0;
+  sync_counter_ = 0;
+  non_writeable_rate_ = 0;
+  new_writable_count_ = 0;
+  non_writable_count_ = 0;
+  table_write_callback_ = nullptr;
+}
+#ifndef ROCKSDB_LITE
+ROT13BlockCipher rot13Cipher_(16);
+#endif  // ROCKSDB_LITE
+
+DBTestBase::DBTestBase(const std::string path)
+    : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
+  Env* base_env = Env::Default();
+#ifndef ROCKSDB_LITE
+  const char* test_env_uri = getenv("TEST_ENV_URI");
+  if (test_env_uri) {
+    Env* test_env = nullptr;
+    Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_);
+    base_env = test_env;
+    EXPECT_OK(s);
+    EXPECT_NE(Env::Default(), base_env);
+  }
+#endif  // !ROCKSDB_LITE
+  EXPECT_NE(nullptr, base_env);
+  if (getenv("MEM_ENV")) {
+    mem_env_ = new MockEnv(base_env);
+  }
+#ifndef ROCKSDB_LITE
+  if (getenv("ENCRYPTED_ENV")) {
+    encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env,
+                                     new CTREncryptionProvider(rot13Cipher_));
+  }
+#endif  // !ROCKSDB_LITE
+  env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
+                                       : (mem_env_ ? mem_env_ : base_env));
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  dbname_ = test::PerThreadDBPath(env_, path);
+  alternative_wal_dir_ = dbname_ + "/wal";
+  alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+  auto options = CurrentOptions();
+  options.env = env_;
+  auto delete_options = options;
+  delete_options.wal_dir = alternative_wal_dir_;
+  EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Destroy again in case no alternative WAL dir was used.
+  EXPECT_OK(DestroyDB(dbname_, options));
+  db_ = nullptr;
+  Reopen(options);
+  Random::GetTLSInstance()->Reset(0xdeadbeef);
+}
+
+DBTestBase::~DBTestBase() {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+  Options options;
+  options.db_paths.emplace_back(dbname_, 0);
+  options.db_paths.emplace_back(dbname_ + "_2", 0);
+  options.db_paths.emplace_back(dbname_ + "_3", 0);
+  options.db_paths.emplace_back(dbname_ + "_4", 0);
+  options.env = env_;
+
+  if (getenv("KEEP_DB")) {
+    printf("DB is still at %s\n", dbname_.c_str());
+  } else {
+    EXPECT_OK(DestroyDB(dbname_, options));
+  }
+  delete env_;
+}
+
+bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) {
+#ifdef ROCKSDB_LITE
+  // These options are not supported in ROCKSDB_LITE
+  if (option_config == kHashSkipList ||
+      option_config == kPlainTableFirstBytePrefix ||
+      option_config == kPlainTableCappedPrefix ||
+      option_config == kPlainTableCappedPrefixNonMmap ||
+      option_config == kPlainTableAllBytesPrefix ||
+      option_config == kVectorRep || option_config == kHashLinkList ||
+      option_config == kUniversalCompaction ||
+      option_config == kUniversalCompactionMultiLevel ||
+      option_config == kUniversalSubcompactions ||
+      option_config == kFIFOCompaction ||
+      option_config == kConcurrentSkipList) {
+    return true;
+  }
+#endif
+
+  if ((skip_mask & kSkipUniversalCompaction) &&
+      (option_config == kUniversalCompaction ||
+       option_config == kUniversalCompactionMultiLevel ||
+       option_config == kUniversalSubcompactions)) {
+    return true;
+  }
+  if ((skip_mask & kSkipMergePut) && option_config == kMergePut) {
+    return true;
+  }
+  if ((skip_mask & kSkipNoSeekToLast) &&
+      (option_config == kHashLinkList || option_config == kHashSkipList)) {
+    return true;
+  }
+  if ((skip_mask & kSkipPlainTable) &&
+      (option_config == kPlainTableAllBytesPrefix ||
+       option_config == kPlainTableFirstBytePrefix ||
+       option_config == kPlainTableCappedPrefix ||
+       option_config == kPlainTableCappedPrefixNonMmap)) {
+    return true;
+  }
+  if ((skip_mask & kSkipHashIndex) &&
+      (option_config == kBlockBasedTableWithPrefixHashIndex ||
+       option_config == kBlockBasedTableWithWholeKeyHashIndex)) {
+    return true;
+  }
+  if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) {
+    return true;
+  }
+  if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) {
+    return true;
+  }
+  return false;
+}
+
+// Switch to a fresh database with the next option configuration to
+// test. Return false if there are no more configurations to test.
+bool DBTestBase::ChangeOptions(int skip_mask) {
+  for (option_config_++; option_config_ < kEnd; option_config_++) {
+    if (ShouldSkipOptions(option_config_, skip_mask)) {
+      continue;
+    }
+    break;
+  }
+
+  if (option_config_ >= kEnd) {
+    Destroy(last_options_);
+    return false;
+  } else {
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    return true;
+  }
+}
+
+// Switch between different compaction styles.
+bool DBTestBase::ChangeCompactOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kUniversalCompaction;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompaction) {
+    option_config_ = kUniversalCompactionMultiLevel;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompactionMultiLevel) {
+    option_config_ = kLevelSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kLevelSubcompactions) {
+    option_config_ = kUniversalSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Switch between different WAL settings
+bool DBTestBase::ChangeWalOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kDBLogDir;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kDBLogDir) {
+    option_config_ = kWalDirAndMmapReads;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kWalDirAndMmapReads) {
+    option_config_ = kRecycleLogFiles;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    TryReopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Switch between different filter policies.
+// Jump from kDefault to kFilter to kFullFilter.
+bool DBTestBase::ChangeFilterOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kFilter;
+  } else if (option_config_ == kFilter) {
+    option_config_ = kFullFilterWithNewTableReaderForCompactions;
+  } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
+    option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
+  } else {
+    return false;
+  }
+  Destroy(last_options_);
+
+  auto options = CurrentOptions();
+  options.create_if_missing = true;
+  TryReopen(options);
+  return true;
+}
+
+// Switch between different DB options for file ingestion tests.
+bool DBTestBase::ChangeOptionsForFileIngestionTest() {
+  if (option_config_ == kDefault) {
+    option_config_ = kUniversalCompaction;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompaction) {
+    option_config_ = kUniversalCompactionMultiLevel;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompactionMultiLevel) {
+    option_config_ = kLevelSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kLevelSubcompactions) {
+    option_config_ = kUniversalSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalSubcompactions) {
+    option_config_ = kDirectIO;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    TryReopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Return the current option configuration.
+Options DBTestBase::CurrentOptions(
+    const anon::OptionsOverride& options_override) const {
+  return GetOptions(option_config_, GetDefaultOptions(), options_override);
+}
+
+Options DBTestBase::CurrentOptions(
+    const Options& default_options,
+    const anon::OptionsOverride& options_override) const {
+  return GetOptions(option_config_, default_options, options_override);
+}
+
+Options DBTestBase::GetDefaultOptions() {
+  Options options;
+  options.write_buffer_size = 4090 * 4096;
+  options.target_file_size_base = 2 * 1024 * 1024;
+  options.max_bytes_for_level_base = 10 * 1024 * 1024;
+  options.max_open_files = 5000;
+  options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+  options.compaction_pri = CompactionPri::kByCompensatedSize;
+  return options;
+}
+
+Options DBTestBase::GetOptions(
+    int option_config, const Options& default_options,
+    const anon::OptionsOverride& options_override) const {
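+  // Each option_config value below selects one test configuration; the switch
+  // mutates a copy of the defaults so configurations stay independent.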
+  // This redundant copy is to minimize code change without causing lint
+  // errors.
+  Options options = default_options;
+  BlockBasedTableOptions table_options;
+  bool set_block_based_table_factory = true;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX)
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "NewRandomAccessFile:O_DIRECT");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "NewWritableFile:O_DIRECT");
+#endif
+
+  bool can_allow_mmap = IsMemoryMappedAccessSupported();
+  switch (option_config) {
+#ifndef ROCKSDB_LITE
+    case kHashSkipList:
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    case kPlainTableFirstBytePrefix:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.allow_mmap_reads = can_allow_mmap;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableCappedPrefix:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+      options.allow_mmap_reads = can_allow_mmap;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableCappedPrefixNonMmap:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+      options.allow_mmap_reads = false;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableAllBytesPrefix:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewNoopTransform());
+      options.allow_mmap_reads = can_allow_mmap;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kVectorRep:
+      options.memtable_factory.reset(new VectorRepFactory(100));
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    case kHashLinkList:
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.memtable_factory.reset(
+          NewHashLinkListRepFactory(4, 0, 3, true, 4));
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    case kDirectIO: {
+      options.use_direct_reads = true;
+      options.use_direct_io_for_flush_and_compaction = true;
+      options.compaction_readahead_size = 2 * 1024 * 1024;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX) && !defined(OS_OPENBSD)
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "NewWritableFile:O_DIRECT", [&](void* arg) {
+            int* val = static_cast<int*>(arg);
+            *val &= ~O_DIRECT;
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "NewRandomAccessFile:O_DIRECT", [&](void* arg) {
+            int* val = static_cast<int*>(arg);
+            *val &= ~O_DIRECT;
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+#endif
+      break;
+    }
+#endif  // ROCKSDB_LITE
+    case kMergePut:
+      options.merge_operator = MergeOperators::CreatePutOperator();
+      break;
+    case kFilter:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+      break;
+    case kFullFilterWithNewTableReaderForCompactions:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      options.new_table_reader_for_compaction_inputs = true;
+      options.compaction_readahead_size = 10 * 1024 * 1024;
+      break;
+    case kPartitionedFilterWithNewTableReaderForCompactions:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      table_options.partition_filters = true;
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      options.new_table_reader_for_compaction_inputs = true;
+      options.compaction_readahead_size = 10 * 1024 * 1024;
+      break;
+    case kUncompressed:
+      options.compression = kNoCompression;
+      break;
+    case kNumLevel_3:
+      options.num_levels = 3;
+      break;
+    case kDBLogDir:
+      options.db_log_dir = alternative_db_log_dir_;
+      break;
+    case kWalDirAndMmapReads:
+      options.wal_dir = alternative_wal_dir_;
+      // mmap reads should be orthogonal to WalDir setting, so we piggyback to
+      // this option config to test mmap reads as well
+      options.allow_mmap_reads = can_allow_mmap;
+      break;
+    case kManifestFileSize:
+      options.max_manifest_file_size = 50;  // 50 bytes
+      break;
+    case kPerfOptions:
+      options.soft_rate_limit = 2.0;
+      options.delayed_write_rate = 8 * 1024 * 1024;
+      options.report_bg_io_stats = true;
+      // TODO(3.13) -- test more options
+      break;
+    case kUniversalCompaction:
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+      break;
+    case kUniversalCompactionMultiLevel:
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 8;
+      break;
+    case kCompressedBlockCache:
+      options.allow_mmap_writes = can_allow_mmap;
+      table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+      break;
+    case kInfiniteMaxOpenFiles:
+      options.max_open_files = -1;
+      break;
+    case kxxHashChecksum: {
+      table_options.checksum = kxxHash;
+      break;
+    }
+    case kxxHash64Checksum: {
+      table_options.checksum = kxxHash64;
+      break;
+    }
+    case kFIFOCompaction: {
+      options.compaction_style = kCompactionStyleFIFO;
+      break;
+    }
+    case kBlockBasedTableWithPrefixHashIndex: {
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      break;
+    }
+    case kBlockBasedTableWithWholeKeyHashIndex: {
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      options.prefix_extractor.reset(NewNoopTransform());
+      break;
+    }
+    case kBlockBasedTableWithPartitionedIndex: {
+      table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      options.prefix_extractor.reset(NewNoopTransform());
+      break;
+    }
+    case kBlockBasedTableWithPartitionedIndexFormat4: {
+      table_options.format_version = 4;
+      // Format 4 changes the binary index format. Since partitioned index is a
+      // super-set of simple indexes, we are also using kTwoLevelIndexSearch to
+      // test this format.
+      table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      // The top-level index in partitioned filters is also affected by
+      // format 4.
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      table_options.partition_filters = true;
+      table_options.index_block_restart_interval = 8;
+      break;
+    }
+    case kBlockBasedTableWithIndexRestartInterval: {
+      table_options.index_block_restart_interval = 8;
+      break;
+    }
+    case kOptimizeFiltersForHits: {
+      options.optimize_filters_for_hits = true;
+      set_block_based_table_factory = true;
+      break;
+    }
+    case kRowCache: {
+      options.row_cache = NewLRUCache(1024 * 1024);
+      break;
+    }
+    case kRecycleLogFiles: {
+      options.recycle_log_file_num = 2;
+      break;
+    }
+    case kLevelSubcompactions: {
+      options.max_subcompactions = 4;
+      break;
+    }
+    case kUniversalSubcompactions: {
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 8;
+      options.max_subcompactions = 4;
+      break;
+    }
+    case kConcurrentSkipList: {
+      options.allow_concurrent_memtable_write = true;
+      options.enable_write_thread_adaptive_yield = true;
+      break;
+    }
+    case kPipelinedWrite: {
+      options.enable_pipelined_write = true;
+      break;
+    }
+    case kConcurrentWALWrites: {
+      // These options optimize the 2PC commit path.
+      options.two_write_queues = true;
+      options.manual_wal_flush = true;
+      break;
+    }
+    case kUnorderedWrite: {
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    }
+
+    default:
+      break;
+  }
+
+  if (options_override.filter_policy) {
+    table_options.filter_policy = options_override.filter_policy;
+    table_options.partition_filters = options_override.partition_filters;
+    table_options.metadata_block_size = options_override.metadata_block_size;
+  }
+  if (set_block_based_table_factory) {
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  options.env = env_;
+  options.create_if_missing = true;
+  options.fail_if_options_file_error = true;
+  return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+                                      const Options& options) {
+  ColumnFamilyOptions cf_opts(options);
+  size_t cfi = handles_.size();
+  handles_.resize(cfi + cfs.size());
+  for (auto cf : cfs) {
+    Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
+    ASSERT_OK(s);
+  }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                                       const Options& options) {
+  CreateColumnFamilies(cfs, options);
+  std::vector<std::string> cfs_plus_default = cfs;
+  cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+  ReopenWithColumnFamilies(cfs_plus_default, options);
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                          const std::vector<Options>& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                          const Options& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs, const std::vector<Options>& options) {
+  Close();
+  EXPECT_EQ(cfs.size(), options.size());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+  }
+  DBOptions db_opts = DBOptions(options[0]);
+  last_options_ = options[0];
+  return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs, const Options& options) {
+  Close();
+  std::vector<Options> v_opts(cfs.size(), options);
+  return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+  ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+  delete db_;
+  db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+  // Destroy using last options
+  Destroy(last_options_);
+  ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  if (delete_cf_paths) {
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      ColumnFamilyDescriptor cfdescriptor;
+      handles_[i]->GetDescriptor(&cfdescriptor);
+      column_families.push_back(cfdescriptor);
+    }
+  }
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options, column_families));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+  return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+  Close();
+  last_options_.table_factory.reset();
+  // Note: operator= is an unsafe approach here since it destructs the
+  // std::shared_ptr members in the same order as their creation, in contrast
+  // to destructors, which destruct them in the opposite order of creation.
+  // One particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually call the destructor of table_factory, which
+  // eventually clears the block cache.
+  last_options_ = options;
+  return DB::Open(options, dbname_, &db_);
+}
+
+bool DBTestBase::IsDirectIOSupported() {
+  return test::IsDirectIOSupported(env_, dbname_);
+}
+
+bool DBTestBase::IsMemoryMappedAccessSupported() const {
+  return (!encrypted_env_);
+}
+
+Status DBTestBase::Flush(int cf) {
+  if (cf == 0) {
+    return db_->Flush(FlushOptions());
+  } else {
+    return db_->Flush(FlushOptions(), handles_[cf]);
+  }
+}
+
+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+  std::vector<ColumnFamilyHandle*> cfhs;
+  std::for_each(cf_ids.begin(), cf_ids.end(),
+                [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+  return db_->Flush(FlushOptions(), cfhs);
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, k, v);
+  } else {
+    return db_->Put(wo, k, v);
+  }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+                       WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, handles_[cf], k, v);
+  } else {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+}
+
+Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
+  return db_->Merge(wo, k, v);
+}
+
+Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
+                         WriteOptions wo) {
+  return db_->Merge(wo, handles_[cf], k, v);
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+  return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+  return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+bool DBTestBase::SetPreserveDeletesSequenceNumber(SequenceNumber sn) {
+  return db_->SetPreserveDeletesSequenceNumber(sn);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::string result;
+  Status s = db_->Get(options, k, &result);
+  if
(s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; +} + +std::string DBTestBase::Get(int cf, const std::string& k, + const Snapshot* snapshot) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::string result; + Status s = db_->Get(options, handles_[cf], k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; +} + +std::vector DBTestBase::MultiGet(std::vector cfs, + const std::vector& k, + const Snapshot* snapshot, + const bool batched) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::vector handles; + std::vector keys; + std::vector result; + + for (unsigned int i = 0; i < cfs.size(); ++i) { + handles.push_back(handles_[cfs[i]]); + keys.push_back(k[i]); + } + std::vector s; + if (!batched) { + s = db_->MultiGet(options, handles, keys, &result); + for (unsigned int i = 0; i < s.size(); ++i) { + if (s[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } else if (!s[i].ok()) { + result[i] = s[i].ToString(); + } + } + } else { + std::vector pin_values(cfs.size()); + result.resize(cfs.size()); + s.resize(cfs.size()); + db_->MultiGet(options, cfs.size(), handles.data(), keys.data(), + pin_values.data(), s.data()); + for (unsigned int i = 0; i < s.size(); ++i) { + if (s[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } else if (!s[i].ok()) { + result[i] = s[i].ToString(); + } else { + result[i].assign(pin_values[i].data(), pin_values[i].size()); + } + } + } + return result; +} + +std::vector DBTestBase::MultiGet(const std::vector& k, + const Snapshot* snapshot) { + ReadOptions options; + options.verify_checksums = true; + options.snapshot = snapshot; + std::vector keys; + std::vector result; + std::vector statuses(k.size()); + std::vector pin_values(k.size()); + + for (unsigned int i = 0; i < k.size(); ++i) { + keys.push_back(k[i]); + } + db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), pin_values.data(), statuses.data()); + result.resize(k.size()); + for (auto iter = result.begin(); iter != result.end(); ++iter) { + iter->assign(pin_values[iter - result.begin()].data(), + pin_values[iter - result.begin()].size()); + } + for (unsigned int i = 0; i < statuses.size(); ++i) { + if (statuses[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } + } + return result; +} + +Status DBTestBase::Get(const std::string& k, PinnableSlice* v) { + ReadOptions options; + options.verify_checksums = true; + Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v); + return s; +} + +uint64_t DBTestBase::GetNumSnapshots() { + uint64_t int_num; + EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num)); + return int_num; +} + +uint64_t DBTestBase::GetTimeOldestSnapshots() { + uint64_t int_num; + EXPECT_TRUE( + dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num)); + return int_num; +} + +uint64_t DBTestBase::GetSequenceOldestSnapshots() { + uint64_t int_num; + EXPECT_TRUE( + dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num)); + return int_num; +} + +// Return a string that contains all key,value pairs in order, +// formatted like "(k1->v1)(k2->v2)". +std::string DBTestBase::Contents(int cf) { + std::vector forward; + std::string result; + Iterator* iter = (cf == 0) ? 
db_->NewIterator(ReadOptions()) + : db_->NewIterator(ReadOptions(), handles_[cf]); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + std::string s = IterStatus(iter); + result.push_back('('); + result.append(s); + result.push_back(')'); + forward.push_back(s); + } + + // Check reverse iteration results are the reverse of forward results + unsigned int matched = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + EXPECT_LT(matched, forward.size()); + EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); + matched++; + } + EXPECT_EQ(matched, forward.size()); + + delete iter; + return result; +} + +std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { + Arena arena; + auto options = CurrentOptions(); + InternalKeyComparator icmp(options.comparator); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + ScopedArenaIterator iter; + if (cf == 0) { + iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber)); + } else { + iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber, handles_[cf])); + } + InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); + iter->Seek(target.Encode()); + std::string result; + if (!iter->status().ok()) { + result = iter->status().ToString(); + } else { + result = "[ "; + bool first = true; + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + if (!ParseInternalKey(iter->key(), &ikey)) { + result += "CORRUPTED"; + } else { + if (!last_options_.comparator->Equal(ikey.user_key, user_key)) { + break; + } + if (!first) { + result += ", "; + } + first = false; + switch (ikey.type) { + case kTypeValue: + result += iter->value().ToString(); + break; + case kTypeMerge: + // keep it the same as kTypeValue for testing kMergePut + result += iter->value().ToString(); + break; + case kTypeDeletion: + result += "DEL"; + break; + case kTypeSingleDeletion: + result += "SDEL"; + break; + default: + assert(false); + break; + } + } + iter->Next(); + } + if (!first) { + result += " "; + } + result += "]"; + } + return result; +} + +#ifndef ROCKSDB_LITE +int DBTestBase::NumSortedRuns(int cf) { + ColumnFamilyMetaData cf_meta; + if (cf == 0) { + db_->GetColumnFamilyMetaData(&cf_meta); + } else { + db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta); + } + int num_sr = static_cast(cf_meta.levels[0].files.size()); + for (size_t i = 1U; i < cf_meta.levels.size(); i++) { + if (cf_meta.levels[i].files.size() > 0) { + num_sr++; + } + } + return num_sr; +} + +uint64_t DBTestBase::TotalSize(int cf) { + ColumnFamilyMetaData cf_meta; + if (cf == 0) { + db_->GetColumnFamilyMetaData(&cf_meta); + } else { + db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta); + } + return cf_meta.size; +} + +uint64_t DBTestBase::SizeAtLevel(int level) { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + uint64_t sum = 0; + for (const auto& m : metadata) { + if (m.level == level) { + sum += m.size; + } + } + return sum; +} + +size_t DBTestBase::TotalLiveFiles(int cf) { + ColumnFamilyMetaData cf_meta; + if (cf == 0) { + db_->GetColumnFamilyMetaData(&cf_meta); + } else { + db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta); + } + size_t num_files = 0; + for (auto& level : cf_meta.levels) { + num_files += level.files.size(); + } + return num_files; +} + +size_t DBTestBase::CountLiveFiles() { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + return metadata.size(); +} + +int DBTestBase::NumTableFilesAtLevel(int level, 
int cf) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(level), &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), + &property)); + } + return atoi(property.c_str()); +} + +double DBTestBase::CompressionRatioAtLevel(int level, int cf) { + std::string property; + if (cf == 0) { + // default cfd + EXPECT_TRUE(db_->GetProperty( + "rocksdb.compression-ratio-at-level" + NumberToString(level), + &property)); + } else { + EXPECT_TRUE(db_->GetProperty( + handles_[cf], + "rocksdb.compression-ratio-at-level" + NumberToString(level), + &property)); + } + return std::stod(property); +} + +int DBTestBase::TotalTableFiles(int cf, int levels) { + if (levels == -1) { + levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); + } + int result = 0; + for (int level = 0; level < levels; level++) { + result += NumTableFilesAtLevel(level, cf); + } + return result; +} + +// Return spread of files per level +std::string DBTestBase::FilesPerLevel(int cf) { + int num_levels = + (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]); + std::string result; + size_t last_non_zero_offset = 0; + for (int level = 0; level < num_levels; level++) { + int f = NumTableFilesAtLevel(level, cf); + char buf[100]; + snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; +} +#endif // !ROCKSDB_LITE + +size_t DBTestBase::CountFiles() { + std::vector files; + env_->GetChildren(dbname_, &files); + + std::vector logfiles; + if (dbname_ != last_options_.wal_dir) { + env_->GetChildren(last_options_.wal_dir, &logfiles); + } + + return files.size() + logfiles.size(); +} + +uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) { + Range r(start, limit); + uint64_t size; + if (cf == 0) { + db_->GetApproximateSizes(&r, 1, &size); + } else { + db_->GetApproximateSizes(handles_[1], &r, 1, &size); + } + return size; +} + +void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + CompactRangeOptions compact_options; + compact_options.target_path_id = target_path_id; + ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit)); +} + +void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) { + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit)); +} + +void DBTestBase::Compact(const Slice& start, const Slice& limit) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit)); +} + +// Do n memtable compactions, each of which produces an sstable +// covering the range [small,large]. +void DBTestBase::MakeTables(int n, const std::string& small, + const std::string& large, int cf) { + for (int i = 0; i < n; i++) { + ASSERT_OK(Put(cf, small, "begin")); + ASSERT_OK(Put(cf, large, "end")); + ASSERT_OK(Flush(cf)); + MoveFilesToLevel(n - i - 1, cf); + } +} + +// Prevent pushing of new sstables into deeper levels by adding +// tables that cover a specified range to all levels. 
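+// For example (editorial sketch): after CreateAndReopenWithCF({"pikachu"},
+// options), calling
+//   FillLevels("", "zzzz", 1);
+// leaves one file spanning ["", "zzzz"] on every level of column family 1,
+// so new sstables cannot be pushed below level 0 by compaction.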
+void DBTestBase::FillLevels(const std::string& smallest, + const std::string& largest, int cf) { + MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf); +} + +void DBTestBase::MoveFilesToLevel(int level, int cf) { + for (int l = 0; l < level; ++l) { + if (cf > 0) { + dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]); + } else { + dbfull()->TEST_CompactRange(l, nullptr, nullptr); + } + } +} + +#ifndef ROCKSDB_LITE +void DBTestBase::DumpFileCounts(const char* label) { + fprintf(stderr, "---\n%s:\n", label); + fprintf(stderr, "maxoverlap: %" PRIu64 "\n", + dbfull()->TEST_MaxNextLevelOverlappingBytes()); + for (int level = 0; level < db_->NumberLevels(); level++) { + int num = NumTableFilesAtLevel(level); + if (num > 0) { + fprintf(stderr, " level %3d : %d files\n", level, num); + } + } +} +#endif // !ROCKSDB_LITE + +std::string DBTestBase::DumpSSTableList() { + std::string property; + db_->GetProperty("rocksdb.sstables", &property); + return property; +} + +void DBTestBase::GetSstFiles(Env* env, std::string path, + std::vector* files) { + env->GetChildren(path, files); + + files->erase( + std::remove_if(files->begin(), files->end(), [](std::string name) { + uint64_t number; + FileType type; + return !(ParseFileName(name, &number, &type) && type == kTableFile); + }), files->end()); +} + +int DBTestBase::GetSstFileCount(std::string path) { + std::vector files; + DBTestBase::GetSstFiles(env_, path, &files); + return static_cast(files.size()); +} + +// this will generate non-overlapping files since it keeps increasing key_idx +void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx, + bool nowait) { + for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { + ASSERT_OK(Put(cf, Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990))); + (*key_idx)++; + } + if (!nowait) { + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } +} + +// this will generate non-overlapping files since it keeps increasing key_idx +void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) { + for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { + ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 99) ? 
1 : 990)));
+    (*key_idx)++;
+  }
+  if (!nowait) {
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+}
+
+const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51;
+
+void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
+  for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) {
+    ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 2000)));
+  }
+  ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 200)));
+  if (!nowait) {
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+}
+
+std::string DBTestBase::IterStatus(Iterator* iter) {
+  std::string result;
+  if (iter->Valid()) {
+    result = iter->key().ToString() + "->" + iter->value().ToString();
+  } else {
+    result = "(invalid)";
+  }
+  return result;
+}
+
+Options DBTestBase::OptionsForLogIterTest() {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = 1000;
+  return options;
+}
+
+std::string DBTestBase::DummyString(size_t len, char c) {
+  return std::string(len, c);
+}
+
+void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
+  Iterator* iter;
+  ReadOptions ro;
+  if (cf == 0) {
+    iter = db_->NewIterator(ro);
+  } else {
+    iter = db_->NewIterator(ro, handles_[cf]);
+  }
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), expected_key);
+  delete iter;
+}
+
+// Used to test InplaceUpdate
+
+// If the previous value is nullptr or delta is larger than the previous
+// value, sets newValue with delta.
+// If the previous value is not empty, updates the previous value in place
+// with a string of 'b's one byte shorter than the previous value.
+UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue,
+                                                  uint32_t* prevSize,
+                                                  Slice delta,
+                                                  std::string* newValue) {
+  if (prevValue == nullptr) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  } else {
+    *prevSize = *prevSize - 1;
+    std::string str_b = std::string(*prevSize, 'b');
+    memcpy(prevValue, str_b.c_str(), str_b.size());
+    return UpdateStatus::UPDATED_INPLACE;
+  }
+}
+
+UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
+                                                        uint32_t* prevSize,
+                                                        Slice delta,
+                                                        std::string* newValue) {
+  if (prevValue == nullptr) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  } else {
+    *prevSize = 1;
+    std::string str_b = std::string(*prevSize, 'b');
+    memcpy(prevValue, str_b.c_str(), str_b.size());
+    return UpdateStatus::UPDATED_INPLACE;
+  }
+}
+
+UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
+                                                 uint32_t* /*prevSize*/,
+                                                 Slice delta,
+                                                 std::string* newValue) {
+  *newValue = std::string(delta.size(), 'c');
+  return UpdateStatus::UPDATED;
+}
+
+UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
+                                               uint32_t* /*prevSize*/,
+                                               Slice /*delta*/,
+                                               std::string* /*newValue*/) {
+  return UpdateStatus::UPDATE_FAILED;
+}
+
+// Utility method to test InplaceUpdate
+void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
+  Arena arena;
+  auto options = CurrentOptions();
+  InternalKeyComparator icmp(options.comparator);
+  ReadRangeDelAggregator range_del_agg(&icmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+  // This should be defined after range_del_agg so that the assigned iterator
+  // is destructed before range_del_agg is destructed.
+ ScopedArenaIterator iter; + if (cf != 0) { + iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber, handles_[cf])); + } else { + iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber)); + } + iter->SeekToFirst(); + ASSERT_EQ(iter->status().ok(), true); + int seq = numValues; + while (iter->Valid()) { + ParsedInternalKey ikey; + ikey.clear(); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + + // checks sequence number for updates + ASSERT_EQ(ikey.sequence, (unsigned)seq--); + iter->Next(); + } + ASSERT_EQ(0, seq); +} + +void DBTestBase::CopyFile(const std::string& source, + const std::string& destination, uint64_t size) { + const EnvOptions soptions; + std::unique_ptr srcfile; + ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions)); + std::unique_ptr destfile; + ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions)); + + if (size == 0) { + // default argument means copy everything + ASSERT_OK(env_->GetFileSize(source, &size)); + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + uint64_t one = std::min(uint64_t(sizeof(buffer)), size); + ASSERT_OK(srcfile->Read(one, &slice, buffer)); + ASSERT_OK(destfile->Append(slice)); + size -= slice.size(); + } + ASSERT_OK(destfile->Close()); +} + +std::unordered_map DBTestBase::GetAllSSTFiles( + uint64_t* total_size) { + std::unordered_map res; + + if (total_size) { + *total_size = 0; + } + std::vector files; + env_->GetChildren(dbname_, &files); + for (auto& file_name : files) { + uint64_t number; + FileType type; + std::string file_path = dbname_ + "/" + file_name; + if (ParseFileName(file_name, &number, &type) && type == kTableFile) { + uint64_t file_size = 0; + env_->GetFileSize(file_path, &file_size); + res[file_path] = file_size; + if (total_size) { + *total_size += file_size; + } + } + } + return res; +} + +std::vector DBTestBase::ListTableFiles(Env* env, + const std::string& path) { + std::vector files; + std::vector file_numbers; + env->GetChildren(path, &files); + uint64_t number; + FileType type; + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == kTableFile) { + file_numbers.push_back(number); + } + } + } + return file_numbers; +} + +void DBTestBase::VerifyDBFromMap(std::map true_data, + size_t* total_reads_res, bool tailing_iter, + std::map status) { + size_t total_reads = 0; + + for (auto& kv : true_data) { + Status s = status[kv.first]; + if (s.ok()) { + ASSERT_EQ(Get(kv.first), kv.second); + } else { + std::string value; + ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value)); + } + total_reads++; + } + + // Normal Iterator + { + int iter_cnt = 0; + ReadOptions ro; + ro.total_order_seek = true; + Iterator* iter = db_->NewIterator(ro); + // Verify Iterator::Next() + iter_cnt = 0; + auto data_iter = true_data.begin(); + Status s; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) { + ASSERT_EQ(iter->key().ToString(), data_iter->first); + Status current_status = status[data_iter->first]; + if (!current_status.ok()) { + s = current_status; + } + ASSERT_EQ(iter->status(), s); + if (current_status.ok()) { + ASSERT_EQ(iter->value().ToString(), data_iter->second); + } + iter_cnt++; + total_reads++; + } + ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / " + << true_data.size(); + delete iter; + + // Verify Iterator::Prev() + // Use a new iterator to make sure its status is clean. 
+ iter = db_->NewIterator(ro); + iter_cnt = 0; + s = Status::OK(); + auto data_rev = true_data.rbegin(); + for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) { + ASSERT_EQ(iter->key().ToString(), data_rev->first); + Status current_status = status[data_rev->first]; + if (!current_status.ok()) { + s = current_status; + } + ASSERT_EQ(iter->status(), s); + if (current_status.ok()) { + ASSERT_EQ(iter->value().ToString(), data_rev->second); + } + iter_cnt++; + total_reads++; + } + ASSERT_EQ(data_rev, true_data.rend()) << iter_cnt << " / " + << true_data.size(); + + // Verify Iterator::Seek() + for (auto kv : true_data) { + iter->Seek(kv.first); + ASSERT_EQ(kv.first, iter->key().ToString()); + ASSERT_EQ(kv.second, iter->value().ToString()); + total_reads++; + } + delete iter; + } + + if (tailing_iter) { +#ifndef ROCKSDB_LITE + // Tailing iterator + int iter_cnt = 0; + ReadOptions ro; + ro.tailing = true; + ro.total_order_seek = true; + Iterator* iter = db_->NewIterator(ro); + + // Verify ForwardIterator::Next() + iter_cnt = 0; + auto data_iter = true_data.begin(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) { + ASSERT_EQ(iter->key().ToString(), data_iter->first); + ASSERT_EQ(iter->value().ToString(), data_iter->second); + iter_cnt++; + total_reads++; + } + ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / " + << true_data.size(); + + // Verify ForwardIterator::Seek() + for (auto kv : true_data) { + iter->Seek(kv.first); + ASSERT_EQ(kv.first, iter->key().ToString()); + ASSERT_EQ(kv.second, iter->value().ToString()); + total_reads++; + } + + delete iter; +#endif // ROCKSDB_LITE + } + + if (total_reads_res) { + *total_reads_res = total_reads; + } +} + +void DBTestBase::VerifyDBInternal( + std::vector> true_data) { + Arena arena; + InternalKeyComparator icmp(last_options_.comparator); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + auto iter = + dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber); + iter->SeekToFirst(); + for (auto p : true_data) { + ASSERT_TRUE(iter->Valid()); + ParsedInternalKey ikey; + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + ASSERT_EQ(p.first, ikey.user_key); + ASSERT_EQ(p.second, iter->value()); + iter->Next(); + }; + ASSERT_FALSE(iter->Valid()); + iter->~InternalIterator(); +} + +#ifndef ROCKSDB_LITE + +uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily( + DB* db, std::string column_family_name) { + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + uint64_t result = 0; + for (auto& fileMetadata : metadata) { + result += (fileMetadata.column_family_name == column_family_name); + } + return result; +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_test_util.h b/src/rocksdb/db/db_test_util.h new file mode 100644 index 000000000..eeabea9bd --- /dev/null +++ b/src/rocksdb/db/db_test_util.h @@ -0,0 +1,1000 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "memtable/hash_linklist_rep.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/sst_file_writer.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/checkpoint.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/mock_table.h" +#include "table/plain/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" +#include "test_util/mock_time_env.h" +#include "util/compression.h" +#include "util/mutexlock.h" + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +namespace anon { +class AtomicCounter { + public: + explicit AtomicCounter(Env* env = NULL) + : env_(env), cond_count_(&mu_), count_(0) {} + + void Increment() { + MutexLock l(&mu_); + count_++; + cond_count_.SignalAll(); + } + + int Read() { + MutexLock l(&mu_); + return count_; + } + + bool WaitFor(int count) { + MutexLock l(&mu_); + + uint64_t start = env_->NowMicros(); + while (count_ < count) { + uint64_t now = env_->NowMicros(); + cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000); + if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) { + return false; + } + if (count_ < count) { + GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual"; + } + } + + return true; + } + + void Reset() { + MutexLock l(&mu_); + count_ = 0; + cond_count_.SignalAll(); + } + + private: + Env* env_; + port::Mutex mu_; + port::CondVar cond_count_; + int count_; +}; + +struct OptionsOverride { + std::shared_ptr filter_policy = nullptr; + // These will be used only if filter_policy is set + bool partition_filters = false; + uint64_t metadata_block_size = 1024; + + // Used as a bit mask of individual enums in which to skip an XF test point + int skip_policy = 0; +}; + +} // namespace anon + +enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 }; + +// A hacky skip list mem table that triggers flush after number of entries. +class SpecialMemTableRep : public MemTableRep { + public: + explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable, + int num_entries_flush) + : MemTableRep(allocator), + memtable_(memtable), + num_entries_flush_(num_entries_flush), + num_entries_(0) {} + + virtual KeyHandle Allocate(const size_t len, char** buf) override { + return memtable_->Allocate(len, buf); + } + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + virtual void Insert(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + + void InsertConcurrently(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + + // Returns true iff an entry that compares equal to key is in the list. + virtual bool Contains(const char* key) const override { + return memtable_->Contains(key); + } + + virtual size_t ApproximateMemoryUsage() override { + // Return a high memory usage when number of entries exceeds the threshold + // to trigger a flush. 
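+    // (Editorial note: i.e. the rep reports ~0 bytes until num_entries_flush_
+    // inserts have happened, then reports 1GB so the write path immediately
+    // schedules a flush, letting tests force a flush after an exact number of
+    // entries.)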
+ return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024; + } + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override { + memtable_->Get(k, callback_args, callback_func); + } + + uint64_t ApproximateNumEntries(const Slice& start_ikey, + const Slice& end_ikey) override { + return memtable_->ApproximateNumEntries(start_ikey, end_ikey); + } + + virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + return memtable_->GetIterator(arena); + } + + virtual ~SpecialMemTableRep() override {} + + private: + std::unique_ptr memtable_; + int num_entries_flush_; + int num_entries_; +}; + +// The factory for the hacky skip list mem table that triggers flush after +// number of entries exceeds a threshold. +class SpecialSkipListFactory : public MemTableRepFactory { + public: + // After number of inserts exceeds `num_entries_flush` in a mem table, trigger + // flush. + explicit SpecialSkipListFactory(int num_entries_flush) + : num_entries_flush_(num_entries_flush) {} + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* /*logger*/) override { + return new SpecialMemTableRep( + allocator, factory_.CreateMemTableRep(compare, allocator, transform, 0), + num_entries_flush_); + } + virtual const char* Name() const override { return "SkipListFactory"; } + + bool IsInsertConcurrentlySupported() const override { + return factory_.IsInsertConcurrentlySupported(); + } + + private: + SkipListFactory factory_; + int num_entries_flush_; +}; + +// Special Env used to delay background operations +class SpecialEnv : public EnvWrapper { + public: + explicit SpecialEnv(Env* base); + + Status NewWritableFile(const std::string& f, std::unique_ptr* r, + const EnvOptions& soptions) override { + class SSTableFile : public WritableFile { + private: + SpecialEnv* env_; + std::unique_ptr base_; + + public: + SSTableFile(SpecialEnv* env, std::unique_ptr&& base) + : env_(env), base_(std::move(base)) {} + Status Append(const Slice& data) override { + if (env_->table_write_callback_) { + (*env_->table_write_callback_)(); + } + if (env_->drop_writes_.load(std::memory_order_acquire)) { + // Drop writes on the floor + return Status::OK(); + } else if (env_->no_space_.load(std::memory_order_acquire)) { + return Status::NoSpace("No space left on device"); + } else { + env_->bytes_written_ += data.size(); + return base_->Append(data); + } + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + if (env_->table_write_callback_) { + (*env_->table_write_callback_)(); + } + if (env_->drop_writes_.load(std::memory_order_acquire)) { + // Drop writes on the floor + return Status::OK(); + } else if (env_->no_space_.load(std::memory_order_acquire)) { + return Status::NoSpace("No space left on device"); + } else { + env_->bytes_written_ += data.size(); + return base_->PositionedAppend(data, offset); + } + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + Status s = base_->RangeSync(offset, nbytes); +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + return s; + } + Status Close() override { +// SyncPoint is not supported in Released Windows Mode. 
+#if !(defined NDEBUG) || !defined(OS_WIN) + // Check preallocation size + // preallocation size is never passed to base file. + size_t preallocation_size = preallocation_block_size(); + TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus", + &preallocation_size); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + Status s = base_->Close(); +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + return s; + } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + ++env_->sync_counter_; + while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) { + env_->SleepForMicroseconds(100000); + } + Status s = base_->Sync(); +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + return s; + } + void SetIOPriority(Env::IOPriority pri) override { + base_->SetIOPriority(pri); + } + Env::IOPriority GetIOPriority() override { + return base_->GetIOPriority(); + } + bool use_direct_io() const override { + return base_->use_direct_io(); + } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + }; + class ManifestFile : public WritableFile { + public: + ManifestFile(SpecialEnv* env, std::unique_ptr&& b) + : env_(env), base_(std::move(b)) {} + Status Append(const Slice& data) override { + if (env_->manifest_write_error_.load(std::memory_order_acquire)) { + return Status::IOError("simulated writer error"); + } else { + return base_->Append(data); + } + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status Close() override { return base_->Close(); } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + ++env_->sync_counter_; + if (env_->manifest_sync_error_.load(std::memory_order_acquire)) { + return Status::IOError("simulated sync error"); + } else { + return base_->Sync(); + } + } + uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + + private: + SpecialEnv* env_; + std::unique_ptr base_; + }; + class WalFile : public WritableFile { + public: + WalFile(SpecialEnv* env, std::unique_ptr&& b) + : env_(env), base_(std::move(b)) { + env_->num_open_wal_file_.fetch_add(1); + } + virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); } + Status Append(const Slice& data) override { +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1"); +#endif + Status s; + if (env_->log_write_error_.load(std::memory_order_acquire)) { + s = Status::IOError("simulated writer error"); + } else { + int slowdown = + env_->log_write_slowdown_.load(std::memory_order_acquire); + if (slowdown > 0) { + env_->SleepForMicroseconds(slowdown); + } + s = base_->Append(data); + } +#if !(defined NDEBUG) || !defined(OS_WIN) + TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2"); +#endif + return s; + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status Close() override { +// SyncPoint is not supported in Released Windows Mode. +#if !(defined NDEBUG) || !defined(OS_WIN) + // Check preallocation size + // preallocation size is never passed to base file. 
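+      // (Editorial note: tests can hook the "DBTestWalFile.GetPreallocationStatus"
+      // sync point below to observe the preallocation block size chosen for
+      // WAL files.)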
+ size_t preallocation_size = preallocation_block_size(); + TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus", + &preallocation_size); +#endif // !(defined NDEBUG) || !defined(OS_WIN) + + return base_->Close(); + } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + ++env_->sync_counter_; + return base_->Sync(); + } + bool IsSyncThreadSafe() const override { + return env_->is_wal_sync_thread_safe_.load(); + } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + + private: + SpecialEnv* env_; + std::unique_ptr base_; + }; + + if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { + uint32_t random_number; + { + MutexLock l(&rnd_mutex_); + random_number = rnd_.Uniform(100); + } + if (random_number < non_writeable_rate_.load()) { + return Status::IOError("simulated random write error"); + } + } + + new_writable_count_++; + + if (non_writable_count_.load() > 0) { + non_writable_count_--; + return Status::IOError("simulated write error"); + } + + EnvOptions optimized = soptions; + if (strstr(f.c_str(), "MANIFEST") != nullptr || + strstr(f.c_str(), "log") != nullptr) { + optimized.use_mmap_writes = false; + optimized.use_direct_writes = false; + } + + Status s = target()->NewWritableFile(f, r, optimized); + if (s.ok()) { + if (strstr(f.c_str(), ".sst") != nullptr) { + r->reset(new SSTableFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "MANIFEST") != nullptr) { + r->reset(new ManifestFile(this, std::move(*r))); + } else if (strstr(f.c_str(), "log") != nullptr) { + r->reset(new WalFile(this, std::move(*r))); + } + } + return s; + } + + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& soptions) override { + class CountingFile : public RandomAccessFile { + public: + CountingFile(std::unique_ptr&& target, + anon::AtomicCounter* counter, + std::atomic* bytes_read) + : target_(std::move(target)), + counter_(counter), + bytes_read_(bytes_read) {} + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + counter_->Increment(); + Status s = target_->Read(offset, n, result, scratch); + *bytes_read_ += result->size(); + return s; + } + + virtual Status Prefetch(uint64_t offset, size_t n) override { + Status s = target_->Prefetch(offset, n); + *bytes_read_ += n; + return s; + } + + private: + std::unique_ptr target_; + anon::AtomicCounter* counter_; + std::atomic* bytes_read_; + }; + + Status s = target()->NewRandomAccessFile(f, r, soptions); + random_file_open_counter_++; + if (s.ok() && count_random_reads_) { + r->reset(new CountingFile(std::move(*r), &random_read_counter_, + &random_read_bytes_counter_)); + } + if (s.ok() && soptions.compaction_readahead_size > 0) { + compaction_readahead_size_ = soptions.compaction_readahead_size; + } + return s; + } + + virtual Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& soptions) override { + class CountingFile : public SequentialFile { + public: + CountingFile(std::unique_ptr&& target, + anon::AtomicCounter* counter) + : target_(std::move(target)), counter_(counter) {} + virtual Status Read(size_t n, Slice* result, char* scratch) override { + counter_->Increment(); + return target_->Read(n, result, scratch); + } + virtual Status Skip(uint64_t n) override { return target_->Skip(n); } + + private: + std::unique_ptr target_; + anon::AtomicCounter* counter_; + }; + + Status s = target()->NewSequentialFile(f, r, soptions); + if 
(s.ok() && count_sequential_reads_) {
+      r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+    }
+    return s;
+  }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    sleep_counter_.Increment();
+    if (no_slowdown_ || time_elapse_only_sleep_) {
+      addon_time_.fetch_add(micros);
+    }
+    if (!no_slowdown_) {
+      target()->SleepForMicroseconds(micros);
+    }
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s;
+    if (!time_elapse_only_sleep_) {
+      s = target()->GetCurrentTime(unix_time);
+    }
+    if (s.ok()) {
+      *unix_time += addon_time_.load();
+    }
+    return s;
+  }
+
+  virtual uint64_t NowCPUNanos() override {
+    now_cpu_count_.fetch_add(1);
+    return target()->NowCPUNanos();
+  }
+
+  virtual uint64_t NowNanos() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+           addon_time_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+           addon_time_.load();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    delete_count_.fetch_add(1);
+    return target()->DeleteFile(fname);
+  }
+
+  Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+  // sstable Sync() calls are blocked while this flag is true.
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this flag is true.
+  std::atomic<bool> drop_writes_;
+
+  // Simulate no-space errors while this flag is true.
+  std::atomic<bool> no_space_;
+
+  // Simulate a non-writable file system while this flag is true.
+  std::atomic<bool> non_writable_;
+
+  // Force sync of manifest files to fail while this flag is true.
+  std::atomic<bool> manifest_sync_error_;
+
+  // Force writes to manifest files to fail while this flag is true.
+  std::atomic<bool> manifest_write_error_;
+
+  // Force writes to log files to fail while this flag is true.
+  std::atomic<bool> log_write_error_;
+
+  // Slow down every log write, in microseconds.
+  std::atomic<int> log_write_slowdown_;
+
+  // Number of WAL files that are still open for write.
+ std::atomic num_open_wal_file_; + + bool count_random_reads_; + anon::AtomicCounter random_read_counter_; + std::atomic random_read_bytes_counter_; + std::atomic random_file_open_counter_; + + bool count_sequential_reads_; + anon::AtomicCounter sequential_read_counter_; + + anon::AtomicCounter sleep_counter_; + + std::atomic bytes_written_; + + std::atomic sync_counter_; + + std::atomic non_writeable_rate_; + + std::atomic new_writable_count_; + + std::atomic non_writable_count_; + + std::function* table_write_callback_; + + std::atomic addon_time_; + + std::atomic now_cpu_count_; + + std::atomic delete_count_; + + std::atomic time_elapse_only_sleep_; + + bool no_slowdown_; + + std::atomic is_wal_sync_thread_safe_{true}; + + std::atomic compaction_readahead_size_{}; +}; + +#ifndef ROCKSDB_LITE +class OnFileDeletionListener : public EventListener { + public: + OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {} + + void SetExpectedFileName(const std::string file_name) { + expected_file_name_ = file_name; + } + + void VerifyMatchedCount(size_t expected_value) { + ASSERT_EQ(matched_count_, expected_value); + } + + void OnTableFileDeleted(const TableFileDeletionInfo& info) override { + if (expected_file_name_ != "") { + ASSERT_EQ(expected_file_name_, info.file_path); + expected_file_name_ = ""; + matched_count_++; + } + } + + private: + size_t matched_count_; + std::string expected_file_name_; +}; +#endif + +// A test merge operator mimics put but also fails if one of merge operands is +// "corrupted". +class TestPutOperator : public MergeOperator { + public: + virtual bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + if (merge_in.existing_value != nullptr && + *(merge_in.existing_value) == "corrupted") { + return false; + } + for (auto value : merge_in.operand_list) { + if (value == "corrupted") { + return false; + } + } + merge_out->existing_operand = merge_in.operand_list.back(); + return true; + } + + virtual const char* Name() const override { return "TestPutOperator"; } +}; + +class DBTestBase : public testing::Test { + public: + // Sequence of option configurations to try + enum OptionConfig : int { + kDefault = 0, + kBlockBasedTableWithPrefixHashIndex = 1, + kBlockBasedTableWithWholeKeyHashIndex = 2, + kPlainTableFirstBytePrefix = 3, + kPlainTableCappedPrefix = 4, + kPlainTableCappedPrefixNonMmap = 5, + kPlainTableAllBytesPrefix = 6, + kVectorRep = 7, + kHashLinkList = 8, + kMergePut = 9, + kFilter = 10, + kFullFilterWithNewTableReaderForCompactions = 11, + kUncompressed = 12, + kNumLevel_3 = 13, + kDBLogDir = 14, + kWalDirAndMmapReads = 15, + kManifestFileSize = 16, + kPerfOptions = 17, + kHashSkipList = 18, + kUniversalCompaction = 19, + kUniversalCompactionMultiLevel = 20, + kCompressedBlockCache = 21, + kInfiniteMaxOpenFiles = 22, + kxxHashChecksum = 23, + kFIFOCompaction = 24, + kOptimizeFiltersForHits = 25, + kRowCache = 26, + kRecycleLogFiles = 27, + kConcurrentSkipList = 28, + kPipelinedWrite = 29, + kConcurrentWALWrites = 30, + kDirectIO, + kLevelSubcompactions, + kBlockBasedTableWithIndexRestartInterval, + kBlockBasedTableWithPartitionedIndex, + kBlockBasedTableWithPartitionedIndexFormat4, + kPartitionedFilterWithNewTableReaderForCompactions, + kUniversalSubcompactions, + kxxHash64Checksum, + kUnorderedWrite, + // This must be the last line + kEnd, + }; + + public: + std::string dbname_; + std::string alternative_wal_dir_; + std::string alternative_db_log_dir_; + MockEnv* mem_env_; + Env* 
encrypted_env_;
+  SpecialEnv* env_;
+  std::shared_ptr<Env> env_guard_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+
+  int option_config_;
+  Options last_options_;
+
+  // Skip some options, as they may not be applicable to a specific test.
+  // To add more skip constants, use values 4, 8, 16, etc.
+  enum OptionSkip {
+    kNoSkip = 0,
+    kSkipDeletesFilterFirst = 1,
+    kSkipUniversalCompaction = 2,
+    kSkipMergePut = 4,
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16,
+    kSkipNoSeekToLast = 32,
+    kSkipFIFOCompaction = 128,
+    kSkipMmapReads = 256,
+  };
+
+  const int kRangeDelSkipConfigs =
+      // Plain tables do not support range deletions.
+      kSkipPlainTable |
+      // MmapReads disables the iterator pinning that RangeDelAggregator
+      // requires.
+      kSkipMmapReads;
+
+  explicit DBTestBase(const std::string path);
+
+  ~DBTestBase();
+
+  static std::string RandomString(Random* rnd, int len) {
+    std::string r;
+    test::RandomString(rnd, len, &r);
+    return r;
+  }
+
+  static std::string Key(int i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "key%06d", i);
+    return std::string(buf);
+  }
+
+  static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip);
+
+  // Switch to a fresh database with the next option configuration to
+  // test. Return false if there are no more configurations to test.
+  bool ChangeOptions(int skip_mask = kNoSkip);
+
+  // Switch between different compaction styles.
+  bool ChangeCompactOptions();
+
+  // Switch between different WAL-related options.
+  bool ChangeWalOptions();
+
+  // Switch between different filter policies.
+  // Jumps from kDefault to kFilter to kFullFilter.
+  bool ChangeFilterOptions();
+
+  // Switch between different DB options for file ingestion tests.
+  bool ChangeOptionsForFileIngestionTest();
+
+  // Return the current option configuration.
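+  // For example (editorial sketch):
+  //   anon::OptionsOverride override;
+  //   override.filter_policy.reset(NewBloomFilterPolicy(10));
+  //   Options options = CurrentOptions(override);
+  // yields the options for the active config with the bloom filter override
+  // applied via GetOptions().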
+ Options CurrentOptions(const anon::OptionsOverride& options_override = + anon::OptionsOverride()) const; + + Options CurrentOptions(const Options& default_options, + const anon::OptionsOverride& options_override = + anon::OptionsOverride()) const; + + static Options GetDefaultOptions(); + + Options GetOptions(int option_config, + const Options& default_options = GetDefaultOptions(), + const anon::OptionsOverride& options_override = + anon::OptionsOverride()) const; + + DBImpl* dbfull() { return reinterpret_cast(db_); } + + void CreateColumnFamilies(const std::vector& cfs, + const Options& options); + + void CreateAndReopenWithCF(const std::vector& cfs, + const Options& options); + + void ReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options); + + void ReopenWithColumnFamilies(const std::vector& cfs, + const Options& options); + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options); + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const Options& options); + + void Reopen(const Options& options); + + void Close(); + + void DestroyAndReopen(const Options& options); + + void Destroy(const Options& options, bool delete_cf_paths = false); + + Status ReadOnlyReopen(const Options& options); + + Status TryReopen(const Options& options); + + bool IsDirectIOSupported(); + + bool IsMemoryMappedAccessSupported() const; + + Status Flush(int cf = 0); + + Status Flush(const std::vector& cf_ids); + + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()); + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()); + + Status Merge(const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()); + + Status Merge(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()); + + Status Delete(const std::string& k); + + Status Delete(int cf, const std::string& k); + + Status SingleDelete(const std::string& k); + + Status SingleDelete(int cf, const std::string& k); + + bool SetPreserveDeletesSequenceNumber(SequenceNumber sn); + + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr); + + std::string Get(int cf, const std::string& k, + const Snapshot* snapshot = nullptr); + + Status Get(const std::string& k, PinnableSlice* v); + + std::vector MultiGet(std::vector cfs, + const std::vector& k, + const Snapshot* snapshot, + const bool batched); + + std::vector MultiGet(const std::vector& k, + const Snapshot* snapshot = nullptr); + + uint64_t GetNumSnapshots(); + + uint64_t GetTimeOldestSnapshots(); + + uint64_t GetSequenceOldestSnapshots(); + + // Return a string that contains all key,value pairs in order, + // formatted like "(k1->v1)(k2->v2)". 
+  std::string Contents(int cf = 0);
+
+  std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+  int NumSortedRuns(int cf = 0);
+
+  uint64_t TotalSize(int cf = 0);
+
+  uint64_t SizeAtLevel(int level);
+
+  size_t TotalLiveFiles(int cf = 0);
+
+  size_t CountLiveFiles();
+
+  int NumTableFilesAtLevel(int level, int cf = 0);
+
+  double CompressionRatioAtLevel(int level, int cf = 0);
+
+  int TotalTableFiles(int cf = 0, int levels = -1);
+#endif  // ROCKSDB_LITE
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf = 0);
+
+  size_t CountFiles();
+
+  uint64_t Size(const Slice& start, const Slice& limit, int cf = 0);
+
+  void Compact(int cf, const Slice& start, const Slice& limit,
+               uint32_t target_path_id);
+
+  void Compact(int cf, const Slice& start, const Slice& limit);
+
+  void Compact(const Slice& start, const Slice& limit);
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0);
+
+  // Prevent pushing of new sstables into deeper levels by adding
+  // tables that cover a specified range to all levels.
+  void FillLevels(const std::string& smallest, const std::string& largest,
+                  int cf);
+
+  void MoveFilesToLevel(int level, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+  void DumpFileCounts(const char* label);
+#endif  // ROCKSDB_LITE
+
+  std::string DumpSSTableList();
+
+  static void GetSstFiles(Env* env, std::string path,
+                          std::vector<std::string>* files);
+
+  int GetSstFileCount(std::string path);
+
+  // this will generate non-overlapping files since it keeps increasing key_idx
+  void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false);
+
+  void GenerateNewFile(int fd, Random* rnd, int* key_idx, bool nowait = false);
+
+  static const int kNumKeysByGenerateNewRandomFile;
+  static const int KNumKeysByGenerateNewFile = 100;
+
+  void GenerateNewRandomFile(Random* rnd, bool nowait = false);
+
+  std::string IterStatus(Iterator* iter);
+
+  Options OptionsForLogIterTest();
+
+  std::string DummyString(size_t len, char c = 'a');
+
+  void VerifyIterLast(std::string expected_key, int cf = 0);
+
+  // Used to test InplaceUpdate
+
+  // If the previous value is nullptr or delta is larger than the previous
+  // value, sets newValue with delta.
+  // If the previous value is not empty, updates the previous value in place
+  // with a string of 'b's one byte shorter than the previous value.
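+  // For example (editorial sketch): with an existing value "bbbb" (size 4),
+  // updateInPlaceSmallerSize shrinks it in place to "bbb" (size 3) and
+  // returns UpdateStatus::UPDATED_INPLACE; with no existing value it returns
+  // UpdateStatus::UPDATED with a new value of delta.size() 'c' characters.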
+ static UpdateStatus updateInPlaceSmallerSize(char* prevValue, + uint32_t* prevSize, Slice delta, + std::string* newValue); + + static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue, + uint32_t* prevSize, + Slice delta, + std::string* newValue); + + static UpdateStatus updateInPlaceLargerSize(char* prevValue, + uint32_t* prevSize, Slice delta, + std::string* newValue); + + static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize, + Slice delta, std::string* newValue); + + // Utility method to test InplaceUpdate + void validateNumberOfEntries(int numValues, int cf = 0); + + void CopyFile(const std::string& source, const std::string& destination, + uint64_t size = 0); + + std::unordered_map GetAllSSTFiles( + uint64_t* total_size = nullptr); + + std::vector ListTableFiles(Env* env, const std::string& path); + + void VerifyDBFromMap( + std::map true_data, + size_t* total_reads_res = nullptr, bool tailing_iter = false, + std::map status = std::map()); + + void VerifyDBInternal( + std::vector> true_data); + +#ifndef ROCKSDB_LITE + uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string column_family_name); +#endif // ROCKSDB_LITE + + uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); + } + + uint64_t TestGetAndResetTickerCount(const Options& options, + Tickers ticker_type) { + return options.statistics->getAndResetTickerCount(ticker_type); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc new file mode 100644 index 000000000..61531ae16 --- /dev/null +++ b/src/rocksdb/db/db_universal_compaction_test.cc @@ -0,0 +1,2254 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#if !defined(ROCKSDB_LITE) +#include "rocksdb/utilities/table_properties_collectors.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +static std::string CompressibleString(Random* rnd, int len) { + std::string r; + test::CompressibleString(rnd, 0.8, len, &r); + return r; +} + +class DBTestUniversalCompactionBase + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + explicit DBTestUniversalCompactionBase( + const std::string& path) : DBTestBase(path) {} + void SetUp() override { + num_levels_ = std::get<0>(GetParam()); + exclusive_manual_compaction_ = std::get<1>(GetParam()); + } + int num_levels_; + bool exclusive_manual_compaction_; +}; + +class DBTestUniversalCompaction : public DBTestUniversalCompactionBase { + public: + DBTestUniversalCompaction() : + DBTestUniversalCompactionBase("/db_universal_compaction_test") {} +}; + +class DBTestUniversalCompaction2 : public DBTestBase { + public: + DBTestUniversalCompaction2() : DBTestBase("/db_universal_compaction_test2") {} +}; + +namespace { +void VerifyCompactionResult( + const ColumnFamilyMetaData& cf_meta, + const std::set& overlapping_file_numbers) { +#ifndef NDEBUG + for (auto& level : cf_meta.levels) { + for (auto& file : level.files) { + assert(overlapping_file_numbers.find(file.name) == + overlapping_file_numbers.end()); + } + } +#endif +} + +class KeepFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return false; + } + + const char* Name() const override { return "KeepFilter"; } +}; + +class KeepFilterFactory : public CompactionFilterFactory { + public: + explicit KeepFilterFactory(bool check_context = false) + : check_context_(check_context) {} + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + if (check_context_) { + EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); + EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + } + return std::unique_ptr(new KeepFilter()); + } + + const char* Name() const override { return "KeepFilterFactory"; } + bool check_context_; + std::atomic_bool expect_full_compaction_; + std::atomic_bool expect_manual_compaction_; +}; + +class DelayFilter : public CompactionFilter { + public: + explicit DelayFilter(DBTestBase* d) : db_test(d) {} + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + db_test->env_->addon_time_.fetch_add(1000); + return true; + } + + const char* Name() const override { return "DelayFilter"; } + + private: + DBTestBase* db_test; +}; + +class DelayFilterFactory : public CompactionFilterFactory { + public: + explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new DelayFilter(db_test)); + } + + const char* Name() const override { return "DelayFilterFactory"; } + + private: + DBTestBase* db_test; +}; +} // namespace + +// Make sure we don't trigger a problem if the trigger condtion is given +// to be 0, which is invalid. 
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) {
+  Options options = CurrentOptions();
+
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  // Config universal compaction to always compact to one single sorted run.
+  options.level0_file_num_compaction_trigger = 0;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_options_universal.min_merge_width = 2;
+  options.compaction_options_universal.max_size_amplification_percent = 0;
+
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  DestroyAndReopen(options);
+  ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+
+  for (int num = 0; num < 16; num++) {
+    // Write 100KB file. And immediately it should be compacted to one file.
+    GenerateNewFile(&rnd, &key_idx);
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ(NumSortedRuns(0), 1);
+  }
+  ASSERT_OK(Put(Key(key_idx), ""));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumSortedRuns(0), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.optimize_filters_for_hits = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.memtable_factory.reset(new SpecialSkipListFactory(3));
+
+  DestroyAndReopen(options);
+
+  // block compaction from happening
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    Put(Key(num * 10), "val");
+    if (num) {
+      dbfull()->TEST_WaitForFlushMemTable();
+    }
+    Put(Key(30 + num * 10), "val");
+    Put(Key(60 + num * 10), "val");
+  }
+  Put("", "");
+  dbfull()->TEST_WaitForFlushMemTable();
+
+  // Query a set of non-existent keys
+  for (int i = 5; i < 90; i += 10) {
+    ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+  }
+
+  // Make sure the bloom filter is used at least once.
+  ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
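[Editor's note: the ticker asserted on below, BLOOM_FILTER_USEFUL, counts point lookups that a bloom filter short-circuited. A minimal sketch of wiring up the same statistics outside the test harness (the path is hypothetical; the first Get may not touch any SST yet, so the counter can legitimately read 0 here):

#include <cassert>
#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  rocksdb::BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/bloom_demo", &db).ok());
  std::string value;
  db->Get(rocksdb::ReadOptions(), "missing-key", &value);  // likely NotFound
  // Lookups rejected by a filter accumulate in this ticker.
  std::cout << options.statistics->getTickerCount(rocksdb::BLOOM_FILTER_USEFUL)
            << std::endl;
  delete db;
}
]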
+  // Make sure the bloom filter is used for all but the last L0 file when
+  // looking up a non-existent key that's in the range of all L0 files.
+  ASSERT_EQ(Get(Key(35)), "NOT_FOUND");
+  ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1,
+            TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+  prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+  // Unblock compaction and wait for it to finish.
+  sleeping_task_low.WakeUp();
+  dbfull()->TEST_WaitForCompact();
+
+  // The same queries will not trigger the bloom filter
+  for (int i = 5; i < 90; i += 10) {
+    ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+  }
+  ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+}
+
+// TODO(kailiu) The tests on UniversalCompaction have some issues:
+// 1. A lot of magic numbers ("11" or "12").
+// 2. They make assumptions about the memtable flush conditions, which may
+//    change from time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        if (num_levels_ > 3) {
+          ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+  // Stage 1:
+  // Generate a set of files at level 0, but don't trigger level-0
+  // compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 100KB
+    GenerateNewFile(1, &rnd, &key_idx);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger+1)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumSortedRuns(1), 1);
+
+  // Stage 2:
+  // Now we have one file at level 0, with size 4. We also have some data in
+  // mem table. Let's continue generating new files at level 0, but don't
+  // trigger level-0 compaction.
+  // First, clean up memtable before inserting new data. This will generate
+  // a level-0 file, with size around 0.4 (according to previously written
+  // data amount).
+  filter->expect_full_compaction_.store(false);
+  ASSERT_OK(Flush(1));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    GenerateNewFile(1, &rnd, &key_idx);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Stage 3:
+  // Now we have 2 files at level 0, with size 4 and 2.4.
Continue + // generating new files at level 0. + for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; + num++) { + GenerateNewFile(1, &rnd, &key_idx); + ASSERT_EQ(NumSortedRuns(1), num + 3); + } + + // Generate one more file at level-0, which should trigger level-0 + // compaction. + GenerateNewFile(1, &rnd, &key_idx); + // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. + // After compaction, we should have 3 files, with size 4, 2.4, 2. + ASSERT_EQ(NumSortedRuns(1), 3); + + // Stage 4: + // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a + // new file of size 1. + GenerateNewFile(1, &rnd, &key_idx); + dbfull()->TEST_WaitForCompact(); + // Level-0 compaction is triggered, but no file will be picked up. + ASSERT_EQ(NumSortedRuns(1), 4); + + // Stage 5: + // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate + // a new file of size 1. + filter->expect_full_compaction_.store(true); + GenerateNewFile(1, &rnd, &key_idx); + dbfull()->TEST_WaitForCompact(); + // All files at level 0 will be compacted into a single one. + ASSERT_EQ(NumSortedRuns(1), 1); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 3; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int key_idx = 0; + + // Generate two files in Level 0. Both files are approx the same size. + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumSortedRuns(1), num + 1); + } + ASSERT_EQ(NumSortedRuns(1), 2); + + // Flush whatever is remaining in memtable. This is typically + // small, which should not trigger size ratio based compaction + // but will instead trigger size amplification. 
+  ASSERT_OK(Flush(1));
+
+  dbfull()->TEST_WaitForCompact();
+
+  // Verify that size amplification did occur
+  ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  // Initial setup of compaction_options_universal will prevent universal
+  // compaction from happening
+  options.compaction_options_universal.size_ratio = 100;
+  options.compaction_options_universal.min_merge_width = 100;
+  DestroyAndReopen(options);
+
+  int total_picked_compactions = 0;
+  int total_size_amp_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+          Compaction* c = static_cast<Compaction*>(arg);
+          if (c->compaction_reason() ==
+              CompactionReason::kUniversalSizeAmplification) {
+            total_size_amp_compactions++;
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Generate two files in Level 0. Both files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
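[Editor's note: the remainder of this test relies on compaction_options_universal being dynamically changeable: DB::SetOptions accepts the nested struct serialized as a "{key=value;...}" string, exactly as used below. A minimal standalone sketch of the same call (path hypothetical):

#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/setoptions_demo", &db).ok());
  // Tighten the size-amplification trigger without reopening the DB.
  rocksdb::Status s = db->SetOptions(
      db->DefaultColumnFamily(),
      {{"compaction_options_universal",
        "{max_size_amplification_percent=110;}"}});
  assert(s.ok());
  delete db;
}
]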
+  // Flush whatever is remaining in memtable. This is typically
+  // small, which should not trigger size ratio based compaction
+  // but could instead trigger size amplification if it's set
+  // to 110.
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  // Verify compaction did not happen
+  ASSERT_EQ(NumSortedRuns(1), 3);
+
+  // Trigger compaction if size amplification exceeds 110% without reopening DB
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_size_amplification_percent,
+            200U);
+  ASSERT_OK(dbfull()->SetOptions(handles_[1],
+                                 {{"compaction_options_universal",
+                                   "{max_size_amplification_percent=110;}"}}));
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_size_amplification_percent,
+            110u);
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal
+                      .max_size_amplification_percent);
+
+  dbfull()->TEST_WaitForCompact();
+  // Verify that size amplification did happen
+  ASSERT_EQ(NumSortedRuns(1), 1);
+  ASSERT_EQ(total_picked_compactions, 1);
+  ASSERT_EQ(total_size_amp_compactions, 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  // Initial setup of compaction_options_universal will prevent universal
+  // compaction from happening
+  options.compaction_options_universal.max_size_amplification_percent = 2000;
+  options.compaction_options_universal.size_ratio = 0;
+  options.compaction_options_universal.min_merge_width = 100;
+  DestroyAndReopen(options);
+
+  int total_picked_compactions = 0;
+  int total_size_ratio_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+          Compaction* c = static_cast<Compaction*>(arg);
+          if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) {
+            total_size_ratio_compactions++;
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Generate three files in Level 0. All files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger);
+
+  // Flush whatever is remaining in memtable. This is typically small, about
+  // 30KB.
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  // Verify compaction did not happen
+  ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1);
+  ASSERT_EQ(total_picked_compactions, 0);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1],
+      {{"compaction_options_universal",
+        "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}}));
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_merge_width,
+            2u);
+  ASSERT_EQ(
+      dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio,
+      100u);
+
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u);
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width,
+            2u);
+
+  dbfull()->TEST_WaitForCompact();
+
+  // Files in L0 are approx: 0.3 (30KB), 1, 1, 1.
+  // On compaction: the files are below the size amp threshold, so we fall
+  // through to checking read amp conditions. The configured size ratio is
+  // not big enough to take 0.3 into consideration. So the next files 1 and 1
+  // are compacted together first, as they satisfy the size ratio condition
+  // and the (min_merge_width, max_merge_width) condition, producing a file of
+  // size 2. Next, the newly generated 2 and the last file 1 are compacted
+  // together. So at the end: #sortedRuns = 2, #picked_compactions = 2, and
+  // all the picked ones are size ratio based compactions.
+  ASSERT_EQ(NumSortedRuns(1), 2);
+  // If max_merge_width had not been changed dynamically above, and if it
+  // continued to be the default value of UINT_MAX, total_picked_compactions
+  // would have been 1.
+  ASSERT_EQ(total_picked_compactions, 2);
+  ASSERT_EQ(total_size_ratio_compactions, 2);
+}
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 10;
+
+  ChangeCompactOptions();
+  Options options;
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.num_levels = 1;
+  options.target_file_size_base = options.write_buffer_size;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+  Random rnd(301);
+  for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForCompact();
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  std::vector<std::string> compaction_input_file_names;
+  for (auto file : cf_meta.levels[0].files) {
+    if (rnd.OneIn(2)) {
+      compaction_input_file_names.push_back(file.name);
+    }
+  }
+
+  if (compaction_input_file_names.size() == 0) {
+    compaction_input_file_names.push_back(
+        cf_meta.levels[0].files[0].name);
+  }
+
+  // expect failure since universal compaction only allows L0 output
+  ASSERT_FALSE(dbfull()
+                   ->CompactFiles(CompactionOptions(), handles_[1],
+                                  compaction_input_file_names, 1)
+                   .ok());
+
+  // expect ok and verify the compacted files no longer exist.
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), handles_[1],
+      compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  VerifyCompactionResult(
+      cf_meta,
+      std::set<std::string>(compaction_input_file_names.begin(),
+                            compaction_input_file_names.end()));
+
+  compaction_input_file_names.clear();
+
+  // Pick the first and the last file, expect everything is
+  // compacted into one single file.
+  compaction_input_file_names.push_back(
+      cf_meta.levels[0].files[0].name);
+  compaction_input_file_names.push_back(
+      cf_meta.levels[0].files[
+          cf_meta.levels[0].files.size() - 1].name);
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), handles_[1],
+      compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.num_levels = 7;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Generate 3 overlapping files
+  Random rnd(301);
+  for (int i = 0; i < 210; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 200; i < 300; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 250; i < 260; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("3", FilesPerLevel(0));
+  // Compact all files into 1 file and put it in L4
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 4;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  db_->CompactRange(compact_options, nullptr, nullptr);
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+class DBTestUniversalCompactionMultiLevels
+    : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompactionMultiLevels() :
+      DBTestUniversalCompactionBase(
+          "/db_universal_compaction_multi_levels_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 8;
+  options.max_background_compactions = 3;
+  options.target_file_size_base = 32 * 1024;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 100000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+
+  dbfull()->TEST_WaitForCompact();
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+
+// Tests universal compaction with trivial move enabled
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+        non_trivial_move++;
+        ASSERT_TRUE(arg != nullptr);
+        int output_level = *(static_cast<int*>(arg));
+        ASSERT_EQ(output_level, 0);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.allow_trivial_move = true;
+  options.num_levels = 3;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 2;
+  options.target_file_size_base = 32 * 1024;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 150000;
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  std::vector<std::string> values;
+
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(trivial_move, 0);
+  ASSERT_GT(non_trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels,
+                        ::testing::Combine(::testing::Values(3, 20),
+                                           ::testing::Bool()));
+
+class DBTestUniversalCompactionParallel :
+    public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompactionParallel() :
+      DBTestUniversalCompactionBase(
+          "/db_universal_compaction_prallel_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 1 << 10;  // 1KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+  options.max_background_flushes = 3;
+  options.target_file_size_base = 1 * 1024;
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Delay every compaction so multiple compactions will happen.
+  std::atomic<int> num_compactions_running(0);
+  std::atomic<bool> has_parallel(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        if (num_compactions_running.fetch_add(1) > 0) {
+          has_parallel.store(true);
+          return;
+        }
+        for (int nwait = 0; nwait < 20000; nwait++) {
+          if (has_parallel.load() || num_compactions_running.load() > 1) {
+            has_parallel.store(true);
+            break;
+          }
+          env_->SleepForMicroseconds(1000);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():End",
+      [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 30000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(num_compactions_running.load(), 0);
+  ASSERT_TRUE(has_parallel.load());
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+
+  // Reopen and check.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options); + for (int i = num_keys; i < num_keys * 2; i++) { + ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i)); + } +} + +TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.write_buffer_size = 1 * 1024; // 1KB + options.level0_file_num_compaction_trigger = 7; + options.max_background_compactions = 2; + options.target_file_size_base = 1024 * 1024; // 1MB + + // Disable size amplifiction compaction + options.compaction_options_universal.max_size_amplification_percent = + UINT_MAX; + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0", + "BackgroundCallCompaction:0"}, + {"UniversalCompactionBuilder::PickCompaction:Return", + "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"}, + {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2", + "CompactionJob::Run():Start"}}); + + int total_picked_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + if (arg) { + total_picked_compactions++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Write 7 files to trigger compaction + int key_idx = 1; + for (int i = 1; i <= 70; i++) { + std::string k = Key(key_idx++); + ASSERT_OK(Put(k, k)); + if (i % 10 == 0) { + ASSERT_OK(Flush()); + } + } + + // Wait for the 1st background compaction process to start + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0"); + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + + // Write 3 files while 1st compaction is held + // These 3 files have different sizes to avoid compacting based on size_ratio + int num_keys = 1000; + for (int i = 0; i < 3; i++) { + for (int j = 1; j <= num_keys; j++) { + std::string k = Key(key_idx++); + ASSERT_OK(Put(k, k)); + } + ASSERT_OK(Flush()); + num_keys -= 100; + } + + // Hold the 1st compaction from finishing + TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2"); + dbfull()->TEST_WaitForCompact(); + + // There should only be one picked compaction as the score drops below one + // after the first one is picked. 
+  EXPECT_EQ(total_picked_compactions, 1);
+  EXPECT_EQ(TotalTableFiles(), 4);
+
+  // Stop the SyncPoint, then destroy the DB and reopen it again
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  key_idx = 1;
+  total_picked_compactions = 0;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write 7 files to trigger compaction
+  for (int i = 1; i <= 70; i++) {
+    std::string k = Key(key_idx++);
+    ASSERT_OK(Put(k, k));
+    if (i % 10 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+
+  // Wait for the 1st background compaction process to start
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  // Write 8 files while 1st compaction is held
+  // These 8 files have different sizes to avoid compacting based on size_ratio
+  num_keys = 1000;
+  for (int i = 0; i < 8; i++) {
+    for (int j = 1; j <= num_keys; j++) {
+      std::string k = Key(key_idx++);
+      ASSERT_OK(Put(k, k));
+    }
+    ASSERT_OK(Flush());
+    num_keys -= 100;
+  }
+
+  // Wait for the 2nd background compaction process to start
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+
+  // Hold the 1st and 2nd compaction from finishing
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+  dbfull()->TEST_WaitForCompact();
+
+  // This time we will trigger one compaction because of the size ratio, and
+  // another compaction because the number of files not yet compacted is
+  // greater than 7.
+  EXPECT_GE(total_picked_compactions, 2);
+}
+
+INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel,
+                        ::testing::Combine(::testing::Values(1, 10),
+                                           ::testing::Values(false)));
+#endif  // ROCKSDB_VALGRIND_RUN
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;        // 4KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = -1;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 990)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+    if (num < options.level0_file_num_compaction_trigger - 1) {
+      ASSERT_EQ(NumSortedRuns(1), num + 1);
+    }
+  }
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;        // 4KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_options_universal.stop_style =
+      kCompactionStopStyleSimilarSize;
+  options.num_levels = num_levels_;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Stage 1:
+  // Generate a set of files at level 0, but don't trigger level-0
+  // compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumSortedRuns(), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger+1)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumSortedRuns(), 1);
+
+  // Stage 2:
+  // Now we have one file at level 0, with size 4. We also have some data in
+  // mem table. Let's continue generating new files at level 0, but don't
+  // trigger level-0 compaction.
+  // First, clean up memtable before inserting new data. This will generate
+  // a level-0 file, with size around 0.4 (according to previously written
+  // data amount).
+  dbfull()->Flush(FlushOptions());
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumSortedRuns(), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 0.4, 2.
+  ASSERT_EQ(NumSortedRuns(), 3);
+  // Stage 3:
+  // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+  // more file at level-0, which should trigger level-0 compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumSortedRuns(), 4);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = 70;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // The first compaction (2) is compressed.
+ for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT(TotalSize(), 110000U * 2 * 0.9); + + // The second compaction (4) is compressed + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT(TotalSize(), 110000 * 4 * 0.9); + + // The third compaction (2 4) is compressed since this time it is + // (1 1 3.2) and 3.2/5.2 doesn't reach ratio. + for (int num = 0; num < 2; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT(TotalSize(), 110000 * 6 * 0.9); + + // When we start for the compaction up to (2 4 8), the latest + // compressed is not compressed. + for (int num = 0; num < 8; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2); +} + +TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) { + if (!Snappy_Supported()) { + return; + } + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = num_levels_; + options.compaction_options_universal.compression_size_percent = 95; + DestroyAndReopen(options); + + Random rnd(301); + int key_idx = 0; + + // When we start for the compaction up to (2 4 8), the latest + // compressed is compressed given the size ratio to compress. 
+ for (int num = 0; num < 14; num++) { + // Write 120KB (12 values, each 10K) + for (int i = 0; i < 12; i++) { + ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2); +} + +#ifndef ROCKSDB_VALGRIND_RUN +// Test that checks trivial move in universal compaction +TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) { + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { + non_trivial_move++; + ASSERT_TRUE(arg != nullptr); + int output_level = *(static_cast(arg)); + ASSERT_EQ(output_level, 0); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.allow_trivial_move = true; + options.num_levels = 2; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 3; + options.max_background_compactions = 1; + options.target_file_size_base = 32 * 1024; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int num_keys = 250000; + for (int i = 0; i < num_keys; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + std::vector values; + + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(trivial_move, 0); + ASSERT_GT(non_trivial_move, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +// Test that checks trivial move in universal compaction +TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) { + int32_t trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { + ASSERT_TRUE(arg != nullptr); + int output_level = *(static_cast(arg)); + ASSERT_EQ(output_level, 0); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.allow_trivial_move = true; + options.num_levels = 15; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 8; + options.max_background_compactions = 2; + options.target_file_size_base = 64 * 1024; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + options = CurrentOptions(options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + Random rnd(301); + int num_keys = 500000; + for (int i = 0; i < num_keys; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + std::vector values; + + ASSERT_OK(Flush(1)); + 
dbfull()->TEST_WaitForCompact(); + + ASSERT_GT(trivial_move, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // ROCKSDB_VALGRIND_RUN + +TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 5; + options.write_buffer_size = 111 << 10; // 114KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + + std::vector filenames; + env_->GetChildren(options.db_paths[1].path, &filenames); + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + } + env_->DeleteDir(options.db_paths[1].path); + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are not going to second path. + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to second path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1,1,4) -> (2, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 2, 4) -> (3, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 4) -> (8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + + // (1, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 1, 8) -> (2, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + // (1, 2, 8) -> (3, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 8) -> (4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + + // (1, 4, 8) -> (5, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + 
+  Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 10;
+  options.write_buffer_size = 111 << 10;  // 111KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+
+  std::vector<Options> option_vector;
+  option_vector.emplace_back(options);
+  ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+  // Configure CF1 specific paths.
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt1);
+  CreateColumnFamilies({"one"}, option_vector[1]);
+
+  // Configure CF2 specific paths.
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt2);
+  CreateColumnFamilies({"two"}, option_vector[2]);
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  Random rnd(301);
+  int key_idx = 0;
+  int key_idx1 = 0;
+  int key_idx2 = 0;
+
+  auto generate_file = [&]() {
+    GenerateNewFile(0, &rnd, &key_idx);
+    GenerateNewFile(1, &rnd, &key_idx1);
+    GenerateNewFile(2, &rnd, &key_idx2);
+  };
+
+  auto check_sstfilecount = [&](int path_id, int expected) {
+    ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+  };
+
+  auto check_getvalues = [&]() {
+    for (int i = 0; i < key_idx; i++) {
+      auto v = Get(0, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx1; i++) {
+      auto v = Get(1, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx2; i++) {
+      auto v = Get(2, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+  };
+
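[Editor's note: this test exercises per-column-family storage paths. Each DbPath entry pairs a directory with a target byte budget, and universal compaction spills larger outputs to later entries; a column family can override the DB-wide layout with its own cf_paths. A minimal sketch of the same layout (directories hypothetical):

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  // DB-wide paths: small files land early in the list, larger compaction
  // outputs move to later, larger-budget paths.
  options.db_paths.emplace_back("/tmp/cfpath_demo", 300 * 1024);
  options.db_paths.emplace_back("/tmp/cfpath_demo_big", 1024 * 1024 * 1024);
  rocksdb::ColumnFamilyOptions cf_opts(options);
  // A column family may override the layout with its own cf_paths.
  cf_opts.cf_paths.emplace_back("/tmp/cfpath_demo_cf1", 300 * 1024);
  cf_opts.cf_paths.emplace_back("/tmp/cfpath_demo_cf1_big",
                                1024 * 1024 * 1024);
  rocksdb::DB* db = nullptr;
  rocksdb::DB::Open(options, "/tmp/cfpath_demo", &db);
  rocksdb::ColumnFamilyHandle* cf = nullptr;
  db->CreateColumnFamily(cf_opts, "one", &cf);
  db->DestroyColumnFamilyHandle(cf);
  delete db;
}
]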
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    generate_file();
+  }
+
+  // Another 110KB triggers a compaction to 400K file to second path
+  generate_file();
+  check_sstfilecount(2, 1);
+
+  // (1, 4)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(0, 1);
+
+  // (1,1,4) -> (2, 4)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(1, 1);
+  check_sstfilecount(0, 0);
+
+  // (1, 2, 4) -> (3, 4)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(1, 1);
+  check_sstfilecount(0, 0);
+
+  // (1, 3, 4) -> (8)
+  generate_file();
+  check_sstfilecount(3, 1);
+
+  // (1, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(0, 1);
+
+  // (1, 1, 8) -> (2, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(1, 1);
+
+  // (1, 2, 8) -> (3, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(1, 1);
+  check_sstfilecount(0, 0);
+
+  // (1, 3, 8) -> (4, 8)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(3, 1);
+
+  // (1, 4, 8) -> (5, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(2, 1);
+  check_sstfilecount(0, 0);
+
+  check_getvalues();
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  check_getvalues();
+
+  Destroy(options, true);
+}
+
+TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) {
+  std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+    std::string keys_in_db;
+    Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      keys_in_db.append(iter->key().ToString());
+      keys_in_db.push_back(',');
+    }
+    delete iter;
+
+    std::string expected_keys;
+    for (int i = 0; i <= num_keys_in_db; i++) {
+      expected_keys.append(Key(i));
+      expected_keys.push_back(',');
+    }
+
+    ASSERT_EQ(keys_in_db, expected_keys);
+  };
+
+  Random rnd(301);
+  int max_key1 = 200;
+  int max_key2 = 600;
+  int max_key3 = 800;
+  const int KNumKeysPerFile = 10;
+
+  // Stage 1: open a DB with universal compaction, num_levels=1
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 200 << 10;  // 200KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.memtable_factory.reset(new SpecialSkipListFactory(KNumKeysPerFile));
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key1; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  // Stage 2: reopen with universal compaction, num_levels=4
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  verify_func(max_key1);
+
+  // Insert more keys
+  for (int i = max_key1 + 1; i <= max_key2; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  verify_func(max_key2);
+  // Compaction to non-L0 has happened.
+  ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
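[Editor's note: Stage 3 below uses CompactRange's change_level/target_level pair to pull every file back to L0 before the DB is reopened with fewer levels. A standalone sketch of that manual-compaction control (path hypothetical):

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  options.num_levels = 4;
  rocksdb::DB* db = nullptr;
  rocksdb::DB::Open(options, "/tmp/change_level_demo", &db);
  db->Put(rocksdb::WriteOptions(), "k", "v");
  rocksdb::CompactRangeOptions cro;
  cro.change_level = true;  // move the compaction output...
  cro.target_level = 0;     // ...to level 0 explicitly
  // nullptr begin/end compacts the whole key space.
  db->CompactRange(cro, nullptr, nullptr);
  delete db;
}
]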
+  // Stage 3: Compact everything back to L0, then reopen with num_levels=1.
+  options.num_levels = 4;
+  options.target_file_size_base = INT_MAX;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Compact all to level 0
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+  // Need to restart it once to remove higher level records in manifest.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Final reopen
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Insert more keys
+  for (int i = max_key2 + 1; i <= max_key3; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  verify_func(max_key3);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.write_buffer_size = 111 << 10;  // 111KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+ // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to second path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1,1,4) -> (2, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 2, 4) -> (3, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 4) -> (8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 1, 8) -> (2, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 2, 8) -> (3, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 3, 8) -> (4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // (1, 4, 8) -> (5, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + Destroy(options); +} + +TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) { + if (num_levels_ == 1) { + // for single-level universal, everything's bottom level so nothing should + // be executed in bottom-pri thread pool. + return; + } + const int kNumFilesTrigger = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {// wait for the full compaction to be picked before adding files intended + // for the second one. + {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"}, + // the full (bottom-pri) compaction waits until a partial (low-pri) + // compaction has started to verify they can run in parallel. 
+ {"DBImpl::BackgroundCompaction:NonTrivial", + "DBImpl::BGWorkBottomCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int num = 0; num < kNumFilesTrigger; num++) { + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx, true /* no_wait */); + // use no_wait above because that one waits for flush and compaction. We + // don't want to wait for compaction because the full compaction is + // intentionally blocked while more files are flushed. + dbfull()->TEST_WaitForFlushMemTable(); + } + if (i == 0) { + TEST_SYNC_POINT( + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"); + } + } + dbfull()->TEST_WaitForCompact(); + + // First compaction should output to bottom level. Second should output to L0 + // since older L0 files pending compaction prevent it from being placed lower. + ASSERT_EQ(NumSortedRuns(), 2); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + +TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) { + // Regression test for extra compactions scheduled. Once enough compactions + // have been scheduled to bring the score below one, we should stop + // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily. + const int kNumFilesTrigger = 8; + Options options = CurrentOptions(); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2; + options.compaction_options_universal.max_size_amplification_percent = + static_cast(-1); + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + options.num_levels = num_levels_; + Reopen(options); + + std::atomic num_compactions_attempted(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void* /*arg*/) { ++num_compactions_attempted; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + dbfull()->TEST_WaitForCompact(); + // Compacting the first four files was enough to bring the score below one so + // there's no need to schedule any more compactions. + ASSERT_EQ(1, num_compactions_attempted); + ASSERT_EQ(NumSortedRuns(), 5); +} + +TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) { + // Regression test for conflict between: + // (1) Running CompactFiles including file in the final sorted run; and + // (2) Picking universal size-amp-triggered compaction, which always includes + // the final sorted run. 
+
+TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) {
+  // Regression test for extra compactions scheduled. Once enough compactions
+  // have been scheduled to bring the score below one, we should stop
+  // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily.
+  const int kNumFilesTrigger = 8;
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2;
+  options.compaction_options_universal.max_size_amplification_percent =
+      static_cast<unsigned int>(-1);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+  options.num_levels = num_levels_;
+  Reopen(options);
+
+  std::atomic<int> num_compactions_attempted(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void* /*arg*/) { ++num_compactions_attempted; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int num = 0; num < kNumFilesTrigger; num++) {
+    ASSERT_EQ(NumSortedRuns(), num);
+    int key_idx = 0;
+    GenerateNewFile(&rnd, &key_idx);
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Compacting the first four files was enough to bring the score below one
+  // so there's no need to schedule any more compactions.
+  ASSERT_EQ(1, num_compactions_attempted);
+  ASSERT_EQ(NumSortedRuns(), 5);
+}
+
+TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) {
+  // Regression test for conflict between:
+  // (1) Running CompactFiles including file in the final sorted run; and
+  // (2) Picking universal size-amp-triggered compaction, which always
+  //     includes the final sorted run.
+  if (exclusive_manual_compaction_) {
+    return;
+  }
+
+  Options opts = CurrentOptions();
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.compaction_options_universal.max_size_amplification_percent = 50;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compression = kNoCompression;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.max_background_compactions = 2;
+  opts.num_levels = num_levels_;
+  Reopen(opts);
+
+  // make sure compaction jobs can be parallelized
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  Put("key", "val");
+  Flush();
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1);
+  ColumnFamilyMetaData cf_meta;
+  ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily();
+  dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta);
+  ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size());
+  std::string first_sst_filename =
+      cf_meta.levels[num_levels_ - 1].files[0].name;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"CompactFilesImpl:0",
+        "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"},
+       {"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "CompactFilesImpl:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_files_thread([&]() {
+    ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh,
+                                     {first_sst_filename}, num_levels_ - 1));
+  });
+
+  TEST_SYNC_POINT(
+      "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0");
+  for (int i = 0; i < 2; ++i) {
+    Put("key", "val");
+    Flush();
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  compact_files_thread.join();
+}
+
+INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction,
+                        ::testing::Combine(::testing::Values(1, 3, 5),
+                                           ::testing::Bool()));
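+
+// The CompactFiles() API used by the conflict test above, in isolation: it
+// takes explicit input file names (obtainable from ColumnFamilyMetaData) and
+// an output level. A rough sketch, assuming an open DB* db:
+//
+//   ColumnFamilyMetaData meta;
+//   db->GetColumnFamilyMetaData(db->DefaultColumnFamily(), &meta);
+//   std::vector<std::string> inputs;
+//   for (const auto& f : meta.levels.back().files) {
+//     inputs.push_back(f.name);
+//   }
+//   Status s = db->CompactFiles(CompactionOptions(), inputs,
+//                               /*output_level=*/db->NumberLevels() - 1);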
+
+class DBTestUniversalManualCompactionOutputPathId
+    : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalManualCompactionOutputPathId() :
+      DBTestUniversalCompactionBase(
+          "/db_universal_compaction_manual_pid_test") {}
+};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+       ManualCompactionOutputPathId) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_paths.emplace_back(dbname_, 1000000000);
+  options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.target_file_size_base = 1 << 30;  // Big size
+  options.level0_file_num_compaction_trigger = 10;
+  Destroy(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  MakeTables(3, "p", "q", 1);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 1
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = 1;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  MakeTables(1, "p", "q", 1);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 0
+  compact_options.target_path_id = 0;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Fail when compacting to an invalid path ID
+  compact_options.target_path_id = 2;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+                  .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(OutputPathId,
+                        DBTestUniversalManualCompactionOutputPathId,
+                        ::testing::Combine(::testing::Values(1, 8),
+                                           ::testing::Bool()));
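+
+// For reference, directing manual compaction output to a specific path, as
+// done twice above: CompactRangeOptions::target_path_id indexes into
+// Options::db_paths, and an out-of-range id yields Status::InvalidArgument()
+// (the final assertion of the test):
+//
+//   CompactRangeOptions cro;
+//   cro.target_path_id = 1;  // write output files to options.db_paths[1]
+//   Status s = db->CompactRange(cro, nullptr, nullptr);  // full key range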
+
+TEST_F(DBTestUniversalCompaction2, BasicL0toL1) {
+  const int kNumKeys = 3000;
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.compression = kNoCompression;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  // MoveFilesToLevel(6);
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  for (i = 1999; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, SingleLevel) {
+  const int kNumKeys = 3000;
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.compression = kNoCompression;
+  opts.num_levels = 1;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+
+  for (i = 1999; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(DBTestUniversalCompaction2, MultipleLevels) {
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 4;
+  opts.compression = kNoCompression;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 500; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 500; i < 1000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 1000; i < 1500; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 1500; i < 2000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+  for (i = 1999; i < 2333; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 2333; i < 2666; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 2666; i < 2999; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+  ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+  for (i = 1900; i < 2100; ++i) {
+    Delete(Key(i));
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+  ASSERT_EQ(0, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(3));
+  ASSERT_EQ(0, NumTableFilesAtLevel(4));
+  ASSERT_EQ(0, NumTableFilesAtLevel(5));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
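+
+// The trigger shared by the tests above: NewCompactOnDeletionCollectorFactory
+// marks an SST file for compaction as soon as any sliding window of
+// `sliding_window_size` consecutive entries contains at least
+// `deletion_trigger` tombstones. Illustrative wiring:
+//
+//   Options opt;
+//   opt.table_properties_collector_factories.emplace_back(
+//       NewCompactOnDeletionCollectorFactory(/*sliding_window_size=*/100,
+//                                            /*deletion_trigger=*/90));
+//
+// That is why writing ~90 tombstones inside a 100-key window in these tests
+// reliably schedules a compaction of the affected file.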
+
+TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 5;
+  opts.compression = kNoCompression;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 2000; i < 3000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 3500; i < 4000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  for (i = 2900; i < 3100; ++i) {
+    Delete(Key(i));
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, IngestBehind) {
+  const int kNumKeys = 3000;
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.compression = kNoCompression;
+  opts.allow_ingest_behind = true;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    Put(Key(i), "val");
+  }
+  Flush();
+  // MoveFilesToLevel(6);
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  for (i = 1999; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(6));
+  ASSERT_GT(NumTableFilesAtLevel(5), 0);
+}
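+
+// allow_ingest_behind, set above, reserves the bottommost level for files
+// ingested "behind" the live data, so automatic compactions stop one level
+// short of the bottom; that reservation is why the test expects L5 rather
+// than L6 to hold the compaction output. The ingestion side would look
+// roughly like this (the SST path is hypothetical):
+//
+//   IngestExternalFileOptions ifo;
+//   ifo.ingest_behind = true;  // requires Options::allow_ingest_behind
+//   Status s = db->IngestExternalFile({"/tmp/bulk.sst"}, ifo);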
+ ASSERT_OK(Put("foo", "bar2")); + Flush(); + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(1, periodic_compactions); + ASSERT_EQ(0, start_level); + ASSERT_EQ(4, output_level); + + // Case 2: Oldest compacted file excceeds periodic compaction threshold + periodic_compactions = 0; + // A flush doesn't trigger a periodic compaction when threshold not hit + ASSERT_OK(Put("foo", "bar2")); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, periodic_compactions); + + // After periodic compaction threshold hits, a flush will trigger + // a compaction + ASSERT_OK(Put("foo", "bar2")); + env_->addon_time_.fetch_add(48 * 60 * 60 + 100); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(1, periodic_compactions); + ASSERT_EQ(0, start_level); + ASSERT_EQ(4, output_level); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) + +int main(int argc, char** argv) { +#if !defined(ROCKSDB_LITE) + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +#else + (void) argc; + (void) argv; + return 0; +#endif +} diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc new file mode 100644 index 000000000..ef81de803 --- /dev/null +++ b/src/rocksdb/db/db_wal_test.cc @@ -0,0 +1,1586 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { +class DBWALTest : public DBTestBase { + public: + DBWALTest() : DBTestBase("/db_wal_test") {} + +#if defined(ROCKSDB_PLATFORM_POSIX) + uint64_t GetAllocatedFileSize(std::string file_name) { + struct stat sbuf; + int err = stat(file_name.c_str(), &sbuf); + assert(err == 0); + return sbuf.st_blocks * 512; + } +#endif +}; + +// A SpecialEnv enriched to give more insight about deleted files +class EnrichedSpecialEnv : public SpecialEnv { + public: + explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {} + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& soptions) override { + InstrumentedMutexLock l(&env_mutex_); + if (f == skipped_wal) { + deleted_wal_reopened = true; + if (IsWAL(f) && largetest_deleted_wal.size() != 0 && + f.compare(largetest_deleted_wal) <= 0) { + gap_in_wals = true; + } + } + return SpecialEnv::NewSequentialFile(f, r, soptions); + } + Status DeleteFile(const std::string& fname) override { + if (IsWAL(fname)) { + deleted_wal_cnt++; + InstrumentedMutexLock l(&env_mutex_); + // If this is the first WAL, remember its name and skip deleting it. We + // remember its name partly because the application might attempt to + // delete the file again. 
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void) argc;
+  (void) argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc
new file mode 100644
index 000000000..ef81de803
--- /dev/null
+++ b/src/rocksdb/db/db_wal_test.cc
@@ -0,0 +1,1586 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBWALTest : public DBTestBase {
+ public:
+  DBWALTest() : DBTestBase("/db_wal_test") {}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+  uint64_t GetAllocatedFileSize(std::string file_name) {
+    struct stat sbuf;
+    int err = stat(file_name.c_str(), &sbuf);
+    assert(err == 0);
+    return sbuf.st_blocks * 512;
+  }
+#endif
+};
+
+// A SpecialEnv enriched to give more insight about deleted files
+class EnrichedSpecialEnv : public SpecialEnv {
+ public:
+  explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {}
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& soptions) override {
+    InstrumentedMutexLock l(&env_mutex_);
+    if (f == skipped_wal) {
+      deleted_wal_reopened = true;
+      if (IsWAL(f) && largest_deleted_wal.size() != 0 &&
+          f.compare(largest_deleted_wal) <= 0) {
+        gap_in_wals = true;
+      }
+    }
+    return SpecialEnv::NewSequentialFile(f, r, soptions);
+  }
+  Status DeleteFile(const std::string& fname) override {
+    if (IsWAL(fname)) {
+      deleted_wal_cnt++;
+      InstrumentedMutexLock l(&env_mutex_);
+      // If this is the first WAL, remember its name and skip deleting it. We
+      // remember its name partly because the application might attempt to
+      // delete the file again.
+      if (skipped_wal.size() != 0 && skipped_wal != fname) {
+        if (largest_deleted_wal.size() == 0 ||
+            largest_deleted_wal.compare(fname) < 0) {
+          largest_deleted_wal = fname;
+        }
+      } else {
+        skipped_wal = fname;
+        return Status::OK();
+      }
+    }
+    return SpecialEnv::DeleteFile(fname);
+  }
+  bool IsWAL(const std::string& fname) {
+    // printf("iswal %s\n", fname.c_str());
+    return fname.compare(fname.size() - 3, 3, "log") == 0;
+  }
+
+  InstrumentedMutex env_mutex_;
+  // the WAL whose actual deletion was skipped by the env
+  std::string skipped_wal = "";
+  // the largest WAL that was requested to be deleted
+  std::string largest_deleted_wal = "";
+  // number of WALs that were successfully deleted
+  std::atomic<size_t> deleted_wal_cnt = {0};
+  // whether the WAL whose deletion from the fs was skipped is reopened during
+  // recovery
+  std::atomic<bool> deleted_wal_reopened = {false};
+  // whether a gap in the WALs was detected during recovery
+  std::atomic<bool> gap_in_wals = {false};
+};
+
+class DBWALTestWithEnrichedEnv : public DBTestBase {
+ public:
+  DBWALTestWithEnrichedEnv() : DBTestBase("/db_wal_test") {
+    enriched_env_ = new EnrichedSpecialEnv(env_->target());
+    auto options = CurrentOptions();
+    options.env = enriched_env_;
+    options.allow_2pc = true;
+    Reopen(options);
+    delete env_;
+    // to be deleted by the parent class
+    env_ = enriched_env_;
+  }
+
+ protected:
+  EnrichedSpecialEnv* enriched_env_;
+};
+
+// Test that the recovery would successfully avoid the gaps between the logs.
+// One known scenario that could cause this is that the application issues the
+// WAL deletions out of order. For the sake of simplicity in the test, here we
+// create the gap by manipulating the env to skip deletion of the first WAL
+// but not the ones after it.
+TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) {
+  auto options = last_options_;
+  // To cause frequent WAL deletion
+  options.write_buffer_size = 128;
+  Reopen(options);
+
+  WriteOptions writeOpt = WriteOptions();
+  for (int i = 0; i < 128 * 5; i++) {
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+  }
+  FlushOptions fo;
+  fo.wait = true;
+  ASSERT_OK(db_->Flush(fo));
+
+  // some WALs are deleted
+  ASSERT_NE(0, enriched_env_->deleted_wal_cnt);
+  // but not the first one
+  ASSERT_NE(0, enriched_env_->skipped_wal.size());
+
+  // Test that the WAL that was not deleted will be skipped during recovery
+  options = last_options_;
+  Reopen(options);
+  ASSERT_FALSE(enriched_env_->deleted_wal_reopened);
+  ASSERT_FALSE(enriched_env_->gap_in_wals);
+}
+ ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); + + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + // again both values should be present. + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "bar")); + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, RollLog) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + for (int i = 0; i < 10; i++) { + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + } + ASSERT_OK(Put(1, "foo", "v4")); + for (int i = 0; i < 10; i++) { + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + } + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, SyncWALNotBlockWrite) { + Options options = CurrentOptions(); + options.max_write_buffer_number = 4; + DestroyAndReopen(options); + + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("foo5", "bar5")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"WritableFileWriter::SyncWithoutFlush:1", + "DBWALTest::SyncWALNotBlockWrite:1"}, + {"DBWALTest::SyncWALNotBlockWrite:2", + "WritableFileWriter::SyncWithoutFlush:2"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread thread([&]() { ASSERT_OK(db_->SyncWAL()); }); + + TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1"); + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo3", "bar3")); + FlushOptions fo; + fo.wait = false; + ASSERT_OK(db_->Flush(fo)); + ASSERT_OK(Put("foo4", "bar4")); + + TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2"); + + thread.join(); + + ASSERT_EQ(Get("foo1"), "bar1"); + ASSERT_EQ(Get("foo2"), "bar2"); + ASSERT_EQ(Get("foo3"), "bar3"); + ASSERT_EQ(Get("foo4"), "bar4"); + ASSERT_EQ(Get("foo5"), "bar5"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, SyncWALNotWaitWrite) { + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("foo3", "bar3")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"}, + {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread thread( + [&]() { ASSERT_OK(Put("foo2", "bar2")); }); + // Moving this to SyncWAL before the actual fsync + // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1"); + ASSERT_OK(db_->SyncWAL()); + // Moving this to SyncWAL after actual fsync + // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); + + thread.join(); + + ASSERT_EQ(Get("foo1"), "bar1"); + ASSERT_EQ(Get("foo2"), "bar2"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, Recover) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v1", Get(1, "foo")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v3", Get(1, 
"foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, RecoverWithTableHandle) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.avoid_flush_during_recovery = false; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Put(1, "bar", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "big", std::string(100, 'a'))); + + options = CurrentOptions(); + const int kSmallMaxOpenFiles = 13; + if (option_config_ == kDBLogDir) { + // Use this option to check not preloading files + // Set the max open files to be small enough so no preload will + // happen. + options.max_open_files = kSmallMaxOpenFiles; + // RocksDB sanitize max open files to at least 20. Modify it back. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = kSmallMaxOpenFiles; + }); + + } else if (option_config_ == kWalDirAndMmapReads) { + // Use this option to check always loading all files. + options.max_open_files = 100; + } else { + options.max_open_files = -1; + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::vector> files; + dbfull()->TEST_GetFilesMetaData(handles_[1], &files); + size_t total_files = 0; + for (const auto& level : files) { + total_files += level.size(); + } + ASSERT_EQ(total_files, 3); + for (const auto& level : files) { + for (const auto& file : level) { + if (options.max_open_files == kSmallMaxOpenFiles) { + ASSERT_TRUE(file.table_reader_handle == nullptr); + } else { + ASSERT_TRUE(file.table_reader_handle != nullptr); + } + } + } + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, IgnoreRecoveredLog) { + std::string backup_logs = dbname_ + "/backup_logs"; + + do { + // delete old files in backup_logs directory + env_->CreateDirIfMissing(backup_logs); + std::vector old_files; + env_->GetChildren(backup_logs, &old_files); + for (auto& file : old_files) { + if (file != "." && file != "..") { + env_->DeleteFile(backup_logs + "/" + file); + } + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.wal_dir = dbname_ + "/logs"; + DestroyAndReopen(options); + + // fill up the DB + std::string one, two; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one))); + ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one))); + + // copy the logs to backup + std::vector logs; + env_->GetChildren(options.wal_dir, &logs); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); + } + } + + // recover the DB + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + + // copy the logs from backup back to wal dir + for (auto& log : logs) { + if (log != ".." 
&& log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // this should ignore the log files, recovery should not happen again + // if the recovery happens, the same merge operator would be called twice, + // leading to incorrect results + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + Close(); + Destroy(options); + Reopen(options); + Close(); + + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + } + } + // assert that we successfully recovered only from logs, even though we + // destroyed the DB + Reopen(options); + ASSERT_EQ(two, Get("foo")); + ASSERT_EQ(one, Get("bar")); + + // Recovery will fail if DB directory doesn't exist. + Destroy(options); + // copy the logs from backup back to wal dir + env_->CreateDirIfMissing(options.wal_dir); + for (auto& log : logs) { + if (log != ".." && log != ".") { + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + env_->DeleteFile(backup_logs + "/" + log); + } + } + Status s = TryReopen(options); + ASSERT_TRUE(!s.ok()); + Destroy(options); + } while (ChangeWalOptions()); +} + +TEST_F(DBWALTest, RecoveryWithEmptyLog) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "foo", "v2")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ASSERT_EQ("v3", Get(1, "foo")); + } while (ChangeWalOptions()); +} + +#if !(defined NDEBUG) || !defined(OS_WIN) +TEST_F(DBWALTest, PreallocateBlock) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1000 * 1000; + options.max_total_wal_size = 0; + + size_t expected_preallocation_size = static_cast( + options.write_buffer_size + options.write_buffer_size / 10); + + DestroyAndReopen(options); + + std::atomic called(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBTestWalFile.GetPreallocationStatus", [&](void* arg) { + ASSERT_TRUE(arg != nullptr); + size_t preallocation_size = *(static_cast(arg)); + ASSERT_EQ(expected_preallocation_size, preallocation_size); + called.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Put("", ""); + Flush(); + Put("", ""); + Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(2, called.load()); + + options.max_total_wal_size = 1000 * 1000; + expected_preallocation_size = static_cast(options.max_total_wal_size); + Reopen(options); + called.store(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBTestWalFile.GetPreallocationStatus", [&](void* arg) { + ASSERT_TRUE(arg != nullptr); + size_t preallocation_size = *(static_cast(arg)); + ASSERT_EQ(expected_preallocation_size, preallocation_size); + called.fetch_add(1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Put("", ""); + Flush(); + Put("", ""); + Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(2, called.load()); + + options.db_write_buffer_size = 800 * 1000; + expected_preallocation_size = + static_cast(options.db_write_buffer_size); + Reopen(options); + called.store(0); + 
+
+TEST_F(DBWALTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  do {
+    // delete old files in backup_logs directory
+    env_->CreateDirIfMissing(backup_logs);
+    std::vector<std::string> old_files;
+    env_->GetChildren(backup_logs, &old_files);
+    for (auto& file : old_files) {
+      if (file != "." && file != "..") {
+        env_->DeleteFile(backup_logs + "/" + file);
+      }
+    }
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+    options.wal_dir = dbname_ + "/logs";
+    DestroyAndReopen(options);
+
+    // fill up the DB
+    std::string one, two;
+    PutFixed64(&one, 1);
+    PutFixed64(&two, 2);
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+    // copy the logs to backup
+    std::vector<std::string> logs;
+    env_->GetChildren(options.wal_dir, &logs);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+      }
+    }
+
+    // recover the DB
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+
+    // copy the logs from backup back to wal dir
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      }
+    }
+    // this should ignore the log files, recovery should not happen again
+    // if the recovery happens, the same merge operator would be called twice,
+    // leading to incorrect results
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+    Destroy(options);
+    Reopen(options);
+    Close();
+
+    // copy the logs from backup back to wal dir
+    env_->CreateDirIfMissing(options.wal_dir);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      }
+    }
+    // assert that we successfully recovered only from logs, even though we
+    // destroyed the DB
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+
+    // Recovery will fail if DB directory doesn't exist.
+    Destroy(options);
+    // copy the logs from backup back to wal dir
+    env_->CreateDirIfMissing(options.wal_dir);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+        // we won't be needing this file any more
+        env_->DeleteFile(backup_logs + "/" + log);
+      }
+    }
+    Status s = TryReopen(options);
+    ASSERT_TRUE(!s.ok());
+    Destroy(options);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithEmptyLog) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v3", Get(1, "foo"));
+  } while (ChangeWalOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, PreallocateBlock) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1000 * 1000;
+  options.max_total_wal_size = 0;
+
+  size_t expected_preallocation_size = static_cast<size_t>(
+      options.write_buffer_size + options.write_buffer_size / 10);
+
+  DestroyAndReopen(options);
+
+  std::atomic<int> called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Put("", "");
+  Flush();
+  Put("", "");
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  options.max_total_wal_size = 1000 * 1000;
+  expected_preallocation_size =
+      static_cast<size_t>(options.max_total_wal_size);
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Put("", "");
+  Flush();
+  Put("", "");
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  options.db_write_buffer_size = 800 * 1000;
+  expected_preallocation_size =
+      static_cast<size_t>(options.db_write_buffer_size);
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Put("", "");
+  Flush();
+  Put("", "");
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  expected_preallocation_size = 700 * 1000;
+  std::shared_ptr<WriteBufferManager> write_buffer_manager =
+      std::make_shared<WriteBufferManager>(static_cast<uint64_t>(700 * 1000));
+  options.write_buffer_manager = write_buffer_manager;
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Put("", "");
+  Flush();
+  Put("", "");
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+}
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
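+
+// A note on the sizes asserted above: WAL preallocation follows the smallest
+// effective write limit. With write_buffer_size = 10'000'000 and no WAL cap,
+// the expected preallocation is
+//
+//   10'000'000 + 10'000'000 / 10 = 11'000'000 bytes
+//
+// and each subsequent Reopen() lowers it to max_total_wal_size (1'000'000),
+// then db_write_buffer_size (800'000), then the WriteBufferManager budget
+// (700'000), which is exactly what the callback checks on each run.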
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, FullPurgePreservesRecycledLog) {
+  // For github issue #1303
+  for (int i = 0; i < 2; ++i) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.recycle_log_file_num = 2;
+    if (i != 0) {
+      options.wal_dir = alternative_wal_dir_;
+    }
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo", "v1"));
+    VectorLogPtr log_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_GT(log_files.size(), 0);
+    ASSERT_OK(Flush());
+
+    // Now the original WAL is in log_files[0] and should be marked for
+    // recycling.
+    // Verify full purge cannot remove this file.
+    JobContext job_context(0);
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&job_context, true /* force */);
+    dbfull()->TEST_UnlockMutex();
+    dbfull()->PurgeObsoleteFiles(job_context);
+
+    if (i == 0) {
+      ASSERT_OK(
+          env_->FileExists(LogFileName(dbname_, log_files[0]->LogNumber())));
+    } else {
+      ASSERT_OK(env_->FileExists(
+          LogFileName(alternative_wal_dir_, log_files[0]->LogNumber())));
+    }
+  }
+}
+
+TEST_F(DBWALTest, FullPurgePreservesLogPendingReuse) {
+  // Ensures full purge cannot delete a WAL while it's in the process of being
+  // recycled. In particular, we force the full purge after a file has been
+  // chosen for reuse, but before it has been renamed.
+  for (int i = 0; i < 2; ++i) {
+    Options options = CurrentOptions();
+    options.recycle_log_file_num = 1;
+    if (i != 0) {
+      options.wal_dir = alternative_wal_dir_;
+    }
+    DestroyAndReopen(options);
+
+    // The first flush creates a second log so writes can continue before the
+    // flush finishes.
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+
+    // The second flush can recycle the first log. Sync points enforce that
+    // the full purge happens after choosing the log to recycle and before it
+    // is renamed.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"DBImpl::CreateWAL:BeforeReuseWritableFile1",
+         "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"},
+        {"DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge",
+         "DBImpl::CreateWAL:BeforeReuseWritableFile2"},
+    });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::port::Thread thread([&]() {
+      TEST_SYNC_POINT(
+          "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge");
+      ASSERT_OK(db_->EnableFileDeletions(true));
+      TEST_SYNC_POINT(
+          "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge");
+    });
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    thread.join();
+  }
+}
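+
+// WAL recycling, the feature guarded by the two tests above: instead of
+// deleting obsolete WALs, RocksDB keeps up to recycle_log_file_num of them
+// and overwrites them for new logs, saving file creation and allocation
+// costs. Illustrative setup:
+//
+//   Options opt;
+//   opt.recycle_log_file_num = 2;  // keep up to two old WALs for reuse
+//
+// The tests pin down that a full purge (FindObsoleteFiles/PurgeObsoleteFiles)
+// must treat a recycled-but-not-yet-renamed log as live instead of deleting
+// it.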
+
+TEST_F(DBWALTest, GetSortedWalFiles) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    VectorLogPtr log_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_EQ(0, log_files.size());
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_EQ(1, log_files.size());
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, GetCurrentWalFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+    std::unique_ptr<LogFile>* bad_log_file = nullptr;
+    ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+    std::unique_ptr<LogFile> log_file;
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    // nothing has been written to the log yet
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_EQ(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves forward
+    ASSERT_OK(Put(0, "foo", "v1"));
+    ASSERT_OK(Put(0, "foo2", "v2"));
+    ASSERT_OK(Put(0, "foo3", "v3"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // force log files to cycle and add some more data, then check if
+    // log number moves forward
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+
+    ASSERT_OK(Put(0, "foo4", "v4"));
+    ASSERT_OK(Put(0, "foo5", "v5"));
+    ASSERT_OK(Put(0, "foo6", "v6"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
+  // Test for regression of WAL cleanup missing files that don't contain data
+  // for every column family.
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    uint64_t earliest_log_nums[2];
+    for (int i = 0; i < 2; ++i) {
+      if (i > 0) {
+        ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+      }
+      VectorLogPtr log_files;
+      ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+      if (log_files.size() > 0) {
+        earliest_log_nums[i] = log_files[0]->LogNumber();
+      } else {
+        earliest_log_nums[i] = port::kMaxUint64;
+      }
+    }
+    // Check at least the first WAL was cleaned up during the recovery.
+    ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithLargeLog) {
+  do {
+    {
+      Options options = CurrentOptions();
+      CreateAndReopenWithCF({"pikachu"}, options);
+      ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+      ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+      ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+      ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    }
+
+    // Make sure that if we re-open with a small write buffer size that
+    // we flush table files in the middle of a large log file.
+    Options options;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+    ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+    ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+    ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+    ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+  } while (ChangeWalOptions());
+}
+
+// In https://reviews.facebook.net/D20661 we changed the
+// recovery behavior: previously for each log file each column family
+// memtable was flushed, even if it was empty. Now it's changed:
+// we try to create the smallest number of table files by merging
+// updates from multiple logs
+TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 5000000;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Since we will reopen DB with smaller write_buffer_size,
+  // each key will go to a new SST file
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Make 'dobrynia' to be flushed and new WAL file to be created
+  ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check sst files amount
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+  }
+  // New WAL file
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+  options.write_buffer_size = 4096;
+  options.arena_block_size = 4096;
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    // No inserts => default is empty
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(0));
+    // First 4 keys go to separate SSTs + 1 more SST for 2 smaller keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(5));
+    // 1 SST for big key + 1 SST for small one
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(2));
+    // 1 SST for all keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+// In https://reviews.facebook.net/D20661 we changed the
+// recovery behavior: previously for each log file each column family
+// memtable was flushed, even if it was empty. Now it's changed:
+// we try to create the smallest number of table files by merging
+// updates from multiple logs
+TEST_F(DBWALTest, RecoverCheckFileAmount) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;
+  options.arena_block_size = 4 * 1024;
+  options.avoid_flush_during_recovery = false;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Make 'nikitich' memtable to be flushed
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // 4 memtables are not flushed, 1 sst file
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+  // Memtable for 'nikitich' has been flushed, new WAL file has opened;
+  // 4 memtables are still not flushed
+
+  // Write to new WAL file
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Fill up 'nikitich' one more time
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  // make it flush
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // There are still 4 memtables not flushed, and 2 sst tables
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check that records for 'default', 'dobrynia' and 'pikachu' from the
+    // first, second and third WALs went to the same SST.
+    // So, there are 6 SSTs: three for 'nikitich', one for 'default', one for
+    // 'dobrynia', one for 'pikachu'
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+TEST_F(DBWALTest, SyncMultipleLogs) {
+  const uint64_t kNumBatches = 2;
+  const int kBatchSize = 1000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  Reopen(options);
+
+  WriteBatch batch;
+  WriteOptions wo;
+  wo.sync = true;
+
+  for (uint64_t b = 0; b < kNumBatches; b++) {
+    batch.Clear();
+    for (int i = 0; i < kBatchSize; i++) {
+      batch.Put(Key(i), DummyString(128));
+    }
+
+    dbfull()->Write(wo, &batch);
+  }
+
+  ASSERT_OK(dbfull()->SyncWAL());
+}
+
+// Github issue 1339. Prior to the fix, we read the sequence id from the first
+// log into a local variable, then kept increasing the variable as we replayed
+// logs, ignoring the actual sequence ids of the records. This is incorrect if
+// some writes come with WAL disabled.
+TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.env = fault_env.get();
+  options.disable_auto_compactions = true;
+  WriteOptions wal_on, wal_off;
+  wal_on.sync = true;
+  wal_on.disableWAL = false;
+  wal_off.disableWAL = true;
+  CreateAndReopenWithCF({"dummy"}, options);
+  ASSERT_OK(Put(1, "dummy", "d1", wal_on));  // seq id 1
+  ASSERT_OK(Put(1, "dummy", "d2", wal_off));
+  ASSERT_OK(Put(1, "dummy", "d3", wal_off));
+  ASSERT_OK(Put(0, "key", "v4", wal_on));  // seq id 4
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Put(0, "key", "v5", wal_on));  // seq id 5
+  ASSERT_EQ("v5", Get(0, "key"));
+  dbfull()->FlushWAL(false);
+  // Simulate a crash.
+  fault_env->SetFilesystemActive(false);
+  Close();
+  fault_env->ResetState();
+  ReopenWithColumnFamilies({"default", "dummy"}, options);
+  // Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
+  ASSERT_EQ("v5", Get(0, "key"));
+  // Destroy DB before destructing fault_env.
+  Destroy(options);
+}
+
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+  // Number of WAL files to generate
+  static const int kWALFilesCount = 10;
+  // Starting number for the WAL file name like 00010.log
+  static const int kWALFileOffset = 10;
+  // Keys to be written per WAL file
+  static const int kKeysPerWALFile = 133;
+  // Size of the value
+  static const int kValueSize = 96;
+
+  // Create WAL files with values filled in
+  static void FillData(DBWALTest* test, const Options& options,
+                       const size_t wal_count, size_t* count) {
+    // Calling internal functions requires sanitized options.
+    Options sanitized_options = SanitizeOptions(test->dbname_, options);
+    const ImmutableDBOptions db_options(sanitized_options);
+
+    *count = 0;
+
+    std::shared_ptr<Cache> table_cache = NewLRUCache(50, 0);
+    EnvOptions env_options;
+    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+
+    std::unique_ptr<VersionSet> versions;
+    std::unique_ptr<WalManager> wal_manager;
+    WriteController write_controller;
+
+    versions.reset(new VersionSet(test->dbname_, &db_options, env_options,
+                                  table_cache.get(), &write_buffer_manager,
+                                  &write_controller,
+                                  /*block_cache_tracer=*/nullptr));
+
+    wal_manager.reset(new WalManager(db_options, env_options));
+
+    std::unique_ptr<log::Writer> current_log_writer;
+
+    for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+      uint64_t current_log_number = j;
+      std::string fname = LogFileName(test->dbname_, current_log_number);
+      std::unique_ptr<WritableFile> file;
+      ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options));
+      std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+          NewLegacyWritableFileWrapper(std::move(file)), fname, env_options));
+      current_log_writer.reset(
+          new log::Writer(std::move(file_writer), current_log_number,
+                          db_options.recycle_log_file_num > 0));
+
+      WriteBatch batch;
+      for (int i = 0; i < kKeysPerWALFile; i++) {
+        std::string key = "key" + ToString((*count)++);
+        std::string value = test->DummyString(kValueSize);
+        assert(current_log_writer.get() != nullptr);
+        uint64_t seq = versions->LastSequence() + 1;
+        batch.Clear();
+        batch.Put(key, value);
+        WriteBatchInternal::SetSequence(&batch, seq);
+        current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch));
+        versions->SetLastAllocatedSequence(seq);
+        versions->SetLastPublishedSequence(seq);
+        versions->SetLastSequence(seq);
+      }
+    }
+  }
+
+  // Recreate and fill the store with some data
+  static size_t FillData(DBWALTest* test, Options* options) {
+    options->create_if_missing = true;
+    test->DestroyAndReopen(*options);
+    test->Close();
+
+    size_t count = 0;
+    FillData(test, *options, kWALFilesCount, &count);
+    return count;
+  }
+
+  // Read back all the keys we wrote and return the number of keys found
+  static size_t GetData(DBWALTest* test) {
+    size_t count = 0;
+    for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+      if (test->Get("key" + ToString(i)) != "NOT_FOUND") {
+        ++count;
+      }
+    }
+    return count;
+  }
+
+  // Manually corrupt the specified WAL
+  static void CorruptWAL(DBWALTest* test, const Options& options,
+                         const double off, const double len,
+                         const int wal_file_id, const bool trunc = false) {
+    Env* env = options.env;
+    std::string fname = LogFileName(test->dbname_, wal_file_id);
+    uint64_t size;
+    ASSERT_OK(env->GetFileSize(fname, &size));
+    ASSERT_GT(size, 0);
+#ifdef OS_WIN
+    // Windows disk cache behaves differently. When we truncate, the original
+    // content is still in the cache because the original handle is still
+    // open. Generally, in Windows, one prohibits shared access to files and
+    // it is not needed for WAL, but we allow it to induce corruption in
+    // various tests.
+    test->Close();
+#endif
+    if (trunc) {
+      ASSERT_EQ(0, truncate(fname.c_str(), static_cast<uint64_t>(size * off)));
+    } else {
+      InduceCorruption(fname, static_cast<size_t>(size * off + 8),
+                       static_cast<size_t>(size * len));
+    }
+  }
+
+  // Overwrite data with 'b' from offset for length len
+  static void InduceCorruption(const std::string& filename, size_t offset,
+                               size_t len) {
+    ASSERT_GT(len, 0U);
+
+    int fd = open(filename.c_str(), O_RDWR);
+
+    // On windows long is 32-bit
+    ASSERT_LE(offset, std::numeric_limits<long>::max());
+
+    ASSERT_GT(fd, 0);
+    ASSERT_EQ(offset, lseek(fd, static_cast<long>(offset), SEEK_SET));
+
+    void* buf = alloca(len);
+    memset(buf, 'b', len);
+    ASSERT_EQ(len, write(fd, buf, static_cast<size_t>(len)));
+
+    close(fd);
+  }
+};
+
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing
+//   writes at the end of any of the logs
+// - We do not expect to open the data store for corruption
+TEST_F(DBWALTest, kTolerateCorruptedTailRecords) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+  for (auto trunc : {true, false}) {        /* Corruption style */
+    for (int i = 0; i < 3; i++) {           /* Corruption offset position */
+      for (int j = jstart; j < jend; j++) { /* WAL file */
+        // Fill data for testing
+        Options options = CurrentOptions();
+        const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+        // test checksum failure or parsing
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, /*wal=*/j, trunc);
+
+        if (trunc) {
+          options.wal_recovery_mode =
+              WALRecoveryMode::kTolerateCorruptedTailRecords;
+          options.create_if_missing = false;
+          ASSERT_OK(TryReopen(options));
+          const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+          ASSERT_TRUE(i == 0 || recovered_row_count > 0);
+          ASSERT_LT(recovered_row_count, row_count);
+        } else {
+          options.wal_recovery_mode =
+              WALRecoveryMode::kTolerateCorruptedTailRecords;
+          ASSERT_NOK(TryReopen(options));
+        }
+      }
+    }
+  }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle or trailing -- incomplete writes or corruption)
+TEST_F(DBWALTest, kAbsoluteConsistency) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+  // Verify clean slate behavior
+  Options options = CurrentOptions();
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+  for (auto trunc : {true, false}) { /* Corruption style */
+    for (int i = 0; i < 4; i++) {    /* Corruption offset position */
+      if (trunc && i == 0) {
+        continue;
+      }
+
+      for (int j = jstart; j < jend; j++) { /* wal files */
+        // fill with new data
+        RecoveryTestHelper::FillData(this, &options);
+        // corrupt the wal
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, j, trunc);
+        // verify
+        options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+        options.create_if_missing = false;
+        ASSERT_NOK(TryReopen(options));
+      }
+    }
+  }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+
+  // Create DB with multiple column families.
+  CreateAndReopenWithCF({"one", "two"}, options);
+  ASSERT_OK(Put(1, "key1", "val1"));
+  ASSERT_OK(Put(2, "key2", "val2"));
+
+  // Record the offset at this point
+  Env* env = options.env;
+  uint64_t wal_file_id = dbfull()->TEST_LogfileNumber();
+  std::string fname = LogFileName(dbname_, wal_file_id);
+  uint64_t offset_to_corrupt;
+  ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+  ASSERT_GT(offset_to_corrupt, 0);
+
+  ASSERT_OK(Put(1, "key3", "val3"));
+  // Corrupt WAL at location of key3
+  RecoveryTestHelper::InduceCorruption(
+      fname, static_cast<size_t>(offset_to_corrupt), static_cast<size_t>(4));
+  ASSERT_OK(Put(2, "key4", "val4"));
+  ASSERT_OK(Put(1, "key5", "val5"));
+  Flush(2);
+
+  // PIT recovery & verify
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+ Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.avoid_flush_during_recovery = true; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + Reopen(options); + for (int i = 0; i < 2; ++i) { + if (i > 0) { + // Flush() triggers deletion of obsolete tracked files + Flush(); + } + VectorLogPtr log_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); + if (i == 0) { + ASSERT_GT(log_files.size(), 0); + } else { + ASSERT_EQ(0, log_files.size()); + } + } +} + +TEST_F(DBWALTest, RecoverWithoutFlush) { + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.create_if_missing = false; + options.disable_auto_compactions = true; + options.write_buffer_size = 64 * 1024 * 1024; + + size_t count = RecoveryTestHelper::FillData(this, &options); + auto validateData = [this, count]() { + for (size_t i = 0; i < count; i++) { + ASSERT_NE(Get("key" + ToString(i)), "NOT_FOUND"); + } + }; + Reopen(options); + validateData(); + // Insert some data without flush + ASSERT_OK(Put("foo", "foo_v1")); + ASSERT_OK(Put("bar", "bar_v1")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v1"); + ASSERT_EQ(Get("bar"), "bar_v1"); + // Insert again and reopen + ASSERT_OK(Put("foo", "foo_v2")); + ASSERT_OK(Put("bar", "bar_v2")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v2"); + ASSERT_EQ(Get("bar"), "bar_v2"); + // manual flush and insert again + Flush(); + ASSERT_EQ(Get("foo"), "foo_v2"); + ASSERT_EQ(Get("bar"), "bar_v2"); + ASSERT_OK(Put("foo", "foo_v3")); + ASSERT_OK(Put("bar", "bar_v3")); + Reopen(options); + validateData(); + ASSERT_EQ(Get("foo"), "foo_v3"); + ASSERT_EQ(Get("bar"), "bar_v3"); +} + +TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { + const std::string kSmallValue = "v"; + const std::string kLargeValue = DummyString(1024); + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.create_if_missing = false; + options.disable_auto_compactions = true; + + auto countWalFiles = [this]() { + VectorLogPtr log_files; + dbfull()->GetSortedWalFiles(log_files); + return log_files.size(); + }; + + // Create DB with multiple column families and multiple log files. + CreateAndReopenWithCF({"one", "two"}, options); + ASSERT_OK(Put(0, "key1", kSmallValue)); + ASSERT_OK(Put(1, "key2", kLargeValue)); + Flush(1); + ASSERT_EQ(1, countWalFiles()); + ASSERT_OK(Put(0, "key3", kSmallValue)); + ASSERT_OK(Put(2, "key4", kLargeValue)); + Flush(2); + ASSERT_EQ(2, countWalFiles()); + + // Reopen, insert and flush. + options.db_write_buffer_size = 64 * 1024 * 1024; + ReopenWithColumnFamilies({"default", "one", "two"}, options); + ASSERT_EQ(Get(0, "key1"), kSmallValue); + ASSERT_EQ(Get(1, "key2"), kLargeValue); + ASSERT_EQ(Get(0, "key3"), kSmallValue); + ASSERT_EQ(Get(2, "key4"), kLargeValue); + // Insert more data. + ASSERT_OK(Put(0, "key5", kLargeValue)); + ASSERT_OK(Put(1, "key6", kLargeValue)); + ASSERT_EQ(3, countWalFiles()); + Flush(1); + ASSERT_OK(Put(2, "key7", kLargeValue)); + dbfull()->FlushWAL(false); + ASSERT_EQ(4, countWalFiles()); + + // Reopen twice and validate. 
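+  // (Two iterations on purpose: the second reopen checks that recovery itself
+  // left the WAL set intact, i.e. the unflushed data survives repeated
+  // restarts, not just the first one.)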
+  for (int i = 0; i < 2; i++) {
+    ReopenWithColumnFamilies({"default", "one", "two"}, options);
+    ASSERT_EQ(Get(0, "key1"), kSmallValue);
+    ASSERT_EQ(Get(1, "key2"), kLargeValue);
+    ASSERT_EQ(Get(0, "key3"), kSmallValue);
+    ASSERT_EQ(Get(2, "key4"), kLargeValue);
+    ASSERT_EQ(Get(0, "key5"), kLargeValue);
+    ASSERT_EQ(Get(1, "key6"), kLargeValue);
+    ASSERT_EQ(Get(2, "key7"), kLargeValue);
+    ASSERT_EQ(4, countWalFiles());
+  }
+}
+
+// In this test we are trying to do the following:
+// 1. Create a DB with a corrupted WAL log;
+// 2. Open with avoid_flush_during_recovery = true;
+// 3. Append more data without flushing, which creates new WAL log.
+// 4. Open again. See if it can correctly handle previous corruption.
+TEST_F(DBWALTest, RecoverFromCorruptedWALWithoutFlush) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+  const int kAppendKeys = 100;
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.create_if_missing = false;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 64 * 1024 * 1024;
+
+  auto getAll = [this]() {
+    std::vector<std::pair<std::string, std::string>> data;
+    ReadOptions ropt;
+    Iterator* iter = dbfull()->NewIterator(ropt);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      data.push_back(
+          std::make_pair(iter->key().ToString(), iter->value().ToString()));
+    }
+    delete iter;
+    return data;
+  };
+  for (auto& mode : wal_recovery_mode_string_map) {
+    options.wal_recovery_mode = mode.second;
+    for (auto trunc : {true, false}) {
+      for (int i = 0; i < 4; i++) {
+        for (int j = jstart; j < jend; j++) {
+          // Create corrupted WAL
+          RecoveryTestHelper::FillData(this, &options);
+          RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                         /*len%=*/.1, /*wal=*/j, trunc);
+          // Skip the test if DB won't open.
+          if (!TryReopen(options).ok()) {
+            ASSERT_TRUE(options.wal_recovery_mode ==
+                            WALRecoveryMode::kAbsoluteConsistency ||
+                        (!trunc &&
+                         options.wal_recovery_mode ==
+                             WALRecoveryMode::kTolerateCorruptedTailRecords));
+            continue;
+          }
+          ASSERT_OK(TryReopen(options));
+          // Append some more data.
+          for (int k = 0; k < kAppendKeys; k++) {
+            std::string key = "extra_key" + ToString(k);
+            std::string value = DummyString(RecoveryTestHelper::kValueSize);
+            ASSERT_OK(Put(key, value));
+          }
+          // Save data for comparison.
+          auto data = getAll();
+          // Reopen. Verify data.
+          ASSERT_OK(TryReopen(options));
+          auto actual_data = getAll();
+          ASSERT_EQ(data, actual_data);
+        }
+      }
+    }
+  }
+}
+
+// Tests that total log size is recovered if we set
+// avoid_flush_during_recovery=true.
+// Flush should trigger if max_total_wal_size is reached.
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
+  class TestFlushListener : public EventListener {
+   public:
+    std::atomic<int> count{0};
+
+    TestFlushListener() = default;
+
+    void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
+      count++;
+      assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason);
+    }
+  };
+  std::shared_ptr<TestFlushListener> test_listener =
+      std::make_shared<TestFlushListener>();
+
+  constexpr size_t kKB = 1024;
+  constexpr size_t kMB = 1024 * 1024;
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.max_total_wal_size = 1 * kMB;
+  options.listeners.push_back(test_listener);
+  // Have to open DB in multi-CF mode to trigger flush when
+  // max_total_wal_size is reached.
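+  // (Sizing sketch for the writes below: nine ~100 KB values fill one WAL to
+  // just under the 1 MB max_total_wal_size; the later ~300 KB write pushes
+  // the total WAL size past the cap, which is what forces the flush.)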
+ CreateAndReopenWithCF({"one"}, options); + // Write some keys and we will end up with one log file which is slightly + // smaller than 1MB. + std::string value_100k(100 * kKB, 'v'); + std::string value_300k(300 * kKB, 'v'); + ASSERT_OK(Put(0, "foo", "v1")); + for (int i = 0; i < 9; i++) { + ASSERT_OK(Put(1, "key" + ToString(i), value_100k)); + } + // Get log files before reopen. + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + uint64_t log_size_before = log_files_before[0]->SizeFileBytes(); + ASSERT_GT(log_size_before, 900 * kKB); + ASSERT_LT(log_size_before, 1 * kMB); + ReopenWithColumnFamilies({"default", "one"}, options); + // Write one more value to make log larger than 1MB. + ASSERT_OK(Put(1, "bar", value_300k)); + // Get log files again. A new log file will be opened. + VectorLogPtr log_files_after_reopen; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen)); + ASSERT_EQ(2, log_files_after_reopen.size()); + ASSERT_EQ(log_files_before[0]->LogNumber(), + log_files_after_reopen[0]->LogNumber()); + ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() + + log_files_after_reopen[1]->SizeFileBytes(), + 1 * kMB); + // Write one more key to trigger flush. + ASSERT_OK(Put(0, "foo", "v2")); + dbfull()->TEST_WaitForFlushMemTable(); + // Flushed two column families. + ASSERT_EQ(2, test_listener->count.load()); +} + +#if defined(ROCKSDB_PLATFORM_POSIX) +#if defined(ROCKSDB_FALLOCATE_PRESENT) +// Tests that we will truncate the preallocated space of the last log from +// previous. +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + Reopen(options); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + // The preallocated space should be truncated. + ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); +} +#endif // ROCKSDB_FALLOCATE_PRESENT +#endif // ROCKSDB_PLATFORM_POSIX + +#endif // ROCKSDB_LITE + +TEST_F(DBWALTest, WalTermTest) { + Options options = CurrentOptions(); + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + + WriteOptions wo; + wo.sync = true; + wo.disableWAL = false; + + WriteBatch batch; + batch.Put("foo", "bar"); + batch.MarkWalTerminationPoint(); + batch.Put("foo2", "bar2"); + + ASSERT_OK(dbfull()->Write(wo, &batch)); + + // make sure we can re-open it. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + ASSERT_EQ("bar", Get(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(1, "foo2")); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/db_write_test.cc b/src/rocksdb/db/db_write_test.cc new file mode 100644 index 000000000..cc1aaac08 --- /dev/null +++ b/src/rocksdb/db/db_write_test.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include +#include +#include "db/db_test_util.h" +#include "db/write_batch_internal.h" +#include "db/write_thread.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// Test variations of WriteImpl. +class DBWriteTest : public DBTestBase, public testing::WithParamInterface { + public: + DBWriteTest() : DBTestBase("/db_write_test") {} + + Options GetOptions() { return DBTestBase::GetOptions(GetParam()); } + + void Open() { DBTestBase::Reopen(GetOptions()); } +}; + +// It is invalid to do sync write while disabling WAL. +TEST_P(DBWriteTest, SyncAndDisableWAL) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = true; + ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument()); + WriteBatch batch; + ASSERT_OK(batch.Put("foo", "bar")); + ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument()); +} + +TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { + Options options = GetOptions(); + options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4; + std::vector threads; + std::atomic thread_num(0); + port::Mutex mutex; + port::CondVar cv(&mutex); + + Reopen(options); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + dbfull()->Put(wo, key, "bar"); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + dbfull()->Put(wo, key, "bar"); + }; + std::function unblock_main_thread_func = [&](void *) { + mutex.Lock(); + cv.SignalAll(); + mutex.Unlock(); + }; + + // Create 3 L0 files and schedule 4th without waiting + Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + Flush(); + Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + Flush(); + Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + Flush(); + Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteTest::WriteThreadHangOnWriteStall:1", + "DBImpl::BackgroundCallFlush:start"}, + {"DBWriteTest::WriteThreadHangOnWriteStall:2", + "DBImpl::WriteImpl:BeforeLeaderEnters"}, + // Make compaction start wait for the write stall to be detected and + // implemented by a write group leader + 
{"DBWriteTest::WriteThreadHangOnWriteStall:3", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Schedule creation of 4th L0 file without waiting. This will seal the + // memtable and then wait for a sync point before writing the file. We need + // to do it this way because SwitchMemtable() needs to enter the + // write_thread + FlushOptions fopt; + fopt.wait = false; + dbfull()->Flush(fopt); + + // Create a mix of slowdown/no_slowdown write threads + mutex.Lock(); + // First leader + threads.emplace_back(write_slowdown_func); + cv.Wait(); + // Second leader. Will stall writes + threads.emplace_back(write_slowdown_func); + cv.Wait(); + threads.emplace_back(write_no_slowdown_func); + cv.Wait(); + threads.emplace_back(write_slowdown_func); + cv.Wait(); + threads.emplace_back(write_no_slowdown_func); + cv.Wait(); + threads.emplace_back(write_slowdown_func); + cv.Wait(); + mutex.Unlock(); + + TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1"); + dbfull()->TEST_WaitForFlushMemTable(nullptr); + // This would have triggered a write stall. Unblock the write group leader + TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); + // The leader is going to create missing newer links. When the leader finishes, + // the next leader is going to delay writes and fail writers with no_slowdown + + TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3"); + for (auto& t : threads) { + t.join(); + } +} + +TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { + constexpr int kNumThreads = 5; + std::unique_ptr mock_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetOptions(); + options.env = mock_env.get(); + Reopen(options); + std::atomic ready_count{0}; + std::atomic leader_count{0}; + std::vector threads; + mock_env->SetFilesystemActive(false); + + // Wait until all threads linked to write threads, to make sure + // all threads join the same batch group. + SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + ready_count++; + auto* w = reinterpret_cast(arg); + if (w->state == WriteThread::STATE_GROUP_LEADER) { + leader_count++; + while (ready_count < kNumThreads) { + // busy waiting + } + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < kNumThreads; i++) { + threads.push_back(port::Thread( + [&](int index) { + // All threads should fail. + auto res = Put("key" + ToString(index), "value"); + if (options.manual_wal_flush) { + ASSERT_TRUE(res.ok()); + // we should see fs error when we do the flush + + // TSAN reports a false alarm for lock-order-inversion but Open and + // FlushWAL are not run concurrently. Disabling this until TSAN is + // fixed. + // res = dbfull()->FlushWAL(false); + // ASSERT_FALSE(res.ok()); + } else { + ASSERT_FALSE(res.ok()); + } + }, + i)); + } + for (int i = 0; i < kNumThreads; i++) { + threads[i].join(); + } + ASSERT_EQ(1, leader_count); + // Close before mock_env destruct. 
+  Close();
+}
+
+TEST_P(DBWriteTest, ManualWalFlushInEffect) {
+  Options options = GetOptions();
+  Reopen(options);
+  // try the 1st WAL created during open
+  ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+  ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+  ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
+  // try the 2nd WAL created during SwitchWAL
+  dbfull()->TEST_SwitchWAL();
+  ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+  ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+  ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  Reopen(options);
+  for (int i = 0; i < 2; i++) {
+    // Forcibly fail WAL write for the first Put only. Subsequent Puts should
+    // fail due to read-only mode
+    mock_env->SetFilesystemActive(i != 0);
+    auto res = Put("key" + ToString(i), "value");
+    // TSAN reports a false alarm for lock-order-inversion but Open and
+    // FlushWAL are not run concurrently. Disabling this until TSAN is
+    // fixed.
+    /*
+    if (options.manual_wal_flush && i == 0) {
+      // even with manual_wal_flush the 2nd Put should return error because of
+      // the read-only mode
+      ASSERT_TRUE(res.ok());
+      // we should see fs error when we do the flush
+      res = dbfull()->FlushWAL(false);
+    }
+    */
+    if (!options.manual_wal_flush) {
+      ASSERT_FALSE(res.ok());
+    }
+  }
+  // Close before mock_env destruct.
+  Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
+  Random rnd(301);
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  options.writable_file_max_buffer_size = 4 * 1024 * 1024;
+  options.write_buffer_size = 3 * 512 * 1024;
+  options.wal_bytes_per_sync = 256 * 1024;
+  options.manual_wal_flush = true;
+  Reopen(options);
+  mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
+  Status s;
+  for (int i = 0; i < 4 * 512; ++i) {
+    s = Put(Key(i), RandomString(&rnd, 1024));
+    if (!s.ok()) {
+      break;
+    }
+  }
+  ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+
+  mock_env->SetFilesystemActive(true);
+  // Close before mock_env destruct.
+  Close();
+}
+
+// Test that db->LockWAL() flushes the WAL after locking.
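+// Minimal usage sketch (based only on the behavior asserted below):
+//   ASSERT_OK(db->LockWAL());    // WAL buffer is flushed once locked
+//   /* inspect or copy the WAL files here */
+//   ASSERT_OK(db->UnlockWAL());  // normal writing resumes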
+TEST_P(DBWriteTest, LockWalInEffect) {
+  Options options = GetOptions();
+  Reopen(options);
+  // try the 1st WAL created during open
+  ASSERT_OK(Put("key" + ToString(0), "value"));
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+  ASSERT_OK(dbfull()->LockWAL());
+  ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
+  ASSERT_OK(dbfull()->UnlockWAL());
+  // try the 2nd WAL created during SwitchWAL
+  dbfull()->TEST_SwitchWAL();
+  ASSERT_OK(Put("key" + ToString(0), "value"));
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
+  ASSERT_OK(dbfull()->LockWAL());
+  ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
+  ASSERT_OK(dbfull()->UnlockWAL());
+}
+
+TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
+  Options options = GetOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  Reopen(options);
+  std::string wal_key_prefix = "WAL_KEY_";
+  std::string no_wal_key_prefix = "K_";
+  // 100 KB value each for NO-WAL operation
+  std::string no_wal_value(1024 * 100, 'X');
+  // 1B value each for WAL operation
+  std::string wal_value = "0";
+  std::thread threads[10];
+  for (int t = 0; t < 10; t++) {
+    threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix,
+                              no_wal_value, this] {
+      for (int i = 0; i < 10; i++) {
+        ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
+        write_option_disable.disableWAL = true;
+        ROCKSDB_NAMESPACE::WriteOptions write_option_default;
+        std::string no_wal_key =
+            no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i);
+        this->Put(no_wal_key, no_wal_value, write_option_disable);
+        std::string wal_key =
+            wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
+        this->Put(wal_key, wal_value, write_option_default);
+        dbfull()->SyncWAL();
+      }
+      return 0;
+    });
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  uint64_t bytes_num = options.statistics->getTickerCount(
+      ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
+  // The written WAL size should be less than 100 KB, even including record
+  // header & footer overhead.
+  ASSERT_LE(bytes_num, 1024 * 100);
+}
+
+INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
+                        testing::Values(DBTestBase::kDefault,
+                                        DBTestBase::kConcurrentWALWrites,
+                                        DBTestBase::kPipelinedWrite));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
new file mode 100644
index 000000000..e10af2b85
--- /dev/null
+++ b/src/rocksdb/db/dbformat.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h" + +#include +#include +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +const ValueType kValueTypeForSeek = kTypeBlobIndex; +const ValueType kValueTypeForSeekForPrev = kTypeDeletion; + +uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(IsExtendedValueType(t)); + return (seq << 8) | t; +} + +EntryType GetEntryType(ValueType value_type) { + switch (value_type) { + case kTypeValue: + return kEntryPut; + case kTypeDeletion: + return kEntryDelete; + case kTypeSingleDeletion: + return kEntrySingleDelete; + case kTypeMerge: + return kEntryMerge; + case kTypeRangeDeletion: + return kEntryRangeDeletion; + case kTypeBlobIndex: + return kEntryBlobIndex; + default: + return kEntryOther; + } +} + +bool ParseFullKey(const Slice& internal_key, FullKey* fkey) { + ParsedInternalKey ikey; + if (!ParseInternalKey(internal_key, &ikey)) { + return false; + } + fkey->user_key = ikey.user_key; + fkey->sequence = ikey.sequence; + fkey->type = GetEntryType(ikey.type); + return true; +} + +void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) { + *seq = packed >> 8; + *t = static_cast(packed & 0xff); + + assert(*seq <= kMaxSequenceNumber); + assert(IsExtendedValueType(*t)); +} + +void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { + result->append(key.user_key.data(), key.user_key.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + +void AppendInternalKeyFooter(std::string* result, SequenceNumber s, + ValueType t) { + PutFixed64(result, PackSequenceAndType(s, t)); +} + +std::string ParsedInternalKey::DebugString(bool hex) const { + char buf[50]; + snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence, + static_cast(type)); + std::string result = "'"; + result += user_key.ToString(hex); + result += buf; + return result; +} + +std::string InternalKey::DebugString(bool hex) const { + std::string result; + ParsedInternalKey parsed; + if (ParseInternalKey(rep_, &parsed)) { + result = parsed.DebugString(hex); + } else { + result = "(bad)"; + result.append(EscapeString(rep_)); + } + return result; +} + +const char* InternalKeyComparator::Name() const { return name_.c_str(); } + +int InternalKeyComparator::Compare(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_.Compare(a.user_key, b.user_key); + if (r == 0) { + if (a.sequence > b.sequence) { + r = -1; + } else if (a.sequence < b.sequence) { + r = +1; + } else if (a.type > b.type) { + r = -1; + } else if (a.type < b.type) { + r = +1; + } + } + return r; +} + +void InternalKeyComparator::FindShortestSeparator(std::string* start, + const Slice& limit) const { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + 
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_.FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() <= user_start.size() &&
+      user_comparator_.Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp,
+               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  user_comparator_.FindShortSuccessor(&tmp);
+  if (tmp.size() <= user_key.size() &&
+      user_comparator_.Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp,
+               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
+                     const Slice* ts) {
+  size_t usize = _user_key.size();
+  size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
+  size_t needed = usize + ts_sz + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;
+  } else {
+    dst = new char[needed];
+  }
+  start_ = dst;
+  // NOTE: We don't support user keys of more than 2GB :)
+  dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
+  kstart_ = dst;
+  memcpy(dst, _user_key.data(), usize);
+  dst += usize;
+  if (nullptr != ts) {
+    memcpy(dst, ts->data(), ts_sz);
+    dst += ts_sz;
+  }
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+void IterKey::EnlargeBuffer(size_t key_size) {
+  // If size is smaller than buffer size, continue using current buffer,
+  // or the static allocated one, as default
+  assert(key_size > buf_size_);
+  // Need to enlarge the buffer.
+  ResetBuffer();
+  buf_ = new char[key_size];
+  buf_size_ = key_size;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
new file mode 100644
index 000000000..de98be8df
--- /dev/null
+++ b/src/rocksdb/db/dbformat.h
@@ -0,0 +1,671 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+#include <memory>
+#include <string>
+#include <utility>
+#include "db/lookup_key.h"
+#include "db/merge_context.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber) +// and a type (ValueType), and they are usually encoded together. +// There are some related helper classes here. + +class InternalKey; + +// Value types encoded as the last component of internal keys. +// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk +// data structures. +// The highest bit of the value type needs to be reserved to SST tables +// for them to do more flexible encoding. +enum ValueType : unsigned char { + kTypeDeletion = 0x0, + kTypeValue = 0x1, + kTypeMerge = 0x2, + kTypeLogData = 0x3, // WAL only. + kTypeColumnFamilyDeletion = 0x4, // WAL only. + kTypeColumnFamilyValue = 0x5, // WAL only. + kTypeColumnFamilyMerge = 0x6, // WAL only. + kTypeSingleDeletion = 0x7, + kTypeColumnFamilySingleDeletion = 0x8, // WAL only. + kTypeBeginPrepareXID = 0x9, // WAL only. + kTypeEndPrepareXID = 0xA, // WAL only. + kTypeCommitXID = 0xB, // WAL only. + kTypeRollbackXID = 0xC, // WAL only. + kTypeNoop = 0xD, // WAL only. + kTypeColumnFamilyRangeDeletion = 0xE, // WAL only. + kTypeRangeDeletion = 0xF, // meta block + kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only + kTypeBlobIndex = 0x11, // Blob DB only + // When the prepared record is also persisted in db, we use a different + // record. This is to ensure that the WAL that is generated by a WritePolicy + // is not mistakenly read by another, which would result into data + // inconsistency. + kTypeBeginPersistedPrepareXID = 0x12, // WAL only. + // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL + // generated by WriteUnprepared write policy is not mistakenly read by + // another. + kTypeBeginUnprepareXID = 0x13, // WAL only. + kMaxValue = 0x7F // Not used for storing records. +}; + +// Defined in dbformat.cc +extern const ValueType kValueTypeForSeek; +extern const ValueType kValueTypeForSeekForPrev; + +// Checks whether a type is an inline value type +// (i.e. a type used in memtable skiplist and sst file datablock). +inline bool IsValueType(ValueType t) { + return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex; +} + +// Checks whether a type is from user operation +// kTypeRangeDeletion is in meta block so this API is separated from above +inline bool IsExtendedValueType(ValueType t) { + return IsValueType(t) || t == kTypeRangeDeletion; +} + +// We leave eight bits empty at the bottom so a type and sequence# +// can be packed together into 64-bits. +static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1); + +static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; + +// The data structure that represents an internal key in the way that user_key, +// sequence number and type are stored in separated forms. +struct ParsedInternalKey { + Slice user_key; + SequenceNumber sequence; + ValueType type; + + ParsedInternalKey() + : sequence(kMaxSequenceNumber) // Make code analyzer happy + {} // Intentionally left uninitialized (for speed) + ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) + : user_key(u), sequence(seq), type(t) {} + std::string DebugString(bool hex = false) const; + + void clear() { + user_key.clear(); + sequence = 0; + type = kTypeDeletion; + } +}; + +// Return the length of the encoding of "key". 
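+// For illustration (hypothetical values): the internal key for user key
+// "foo", sequence 7 and kTypeValue is the 3 user-key bytes followed by the
+// little-endian fixed64 encoding of (7 << 8) | 0x1 -- hence the "+ 8" below.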
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + 8;
+}
+
+// Pack a sequence number and a ValueType into a uint64_t
+extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
+
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq,
+                                  ValueType* t);
+
+EntryType GetEntryType(ValueType value_type);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+// Serialized internal key consists of user key followed by footer.
+// This function appends the footer to *result, assuming that *result already
+// contains the user key at the end.
+extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+                                    ValueType t);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+                                             size_t ts_sz) {
+  assert(internal_key.size() >= 8 + ts_sz);
+  return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz);
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  assert(user_key.size() >= ts_sz);
+  return Slice(user_key.data(), user_key.size() - ts_sz);
+}
+
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  return DecodeFixed64(internal_key.data() + n - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  uint64_t num = ExtractInternalKeyFooter(internal_key);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator
+#ifdef NDEBUG
+    final
+#endif
+    : public Comparator {
+ private:
+  UserComparatorWrapper user_comparator_;
+  std::string name_;
+
+ public:
+  explicit InternalKeyComparator(const Comparator* c)
+      : user_comparator_(c),
+        name_("rocksdb.InternalKeyComparator:" +
+              std::string(user_comparator_.Name())) {}
+  virtual ~InternalKeyComparator() {}
+
+  virtual const char* Name() const override;
+  virtual int Compare(const Slice& a, const Slice& b) const override;
+  // Same as Compare except that it excludes the value type from comparison
+  virtual int CompareKeySeq(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const override;
+  virtual void FindShortSuccessor(std::string* key) const override;
+
+  const Comparator* user_comparator() const {
+    return user_comparator_.user_comparator();
+  }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+  virtual const Comparator* GetRootComparator() const override {
+    return user_comparator_.GetRootComparator();
+  }
+};
+
+// This class represents the internal key in encoded form.
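+// Illustrative use (hypothetical values, relying only on the members below):
+//   InternalKey ik("user_key", /*seq=*/100, kTypeValue);
+//   Slice encoded = ik.Encode();   // user key bytes + 8-byte footer
+//   Slice user = ik.user_key();    // strips the footer again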
+class InternalKey {
+ private:
+  std::string rep_;
+
+ public:
+  InternalKey() {}  // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+  }
+
+  // sets the internal key to be bigger or equal to all internal keys with this
+  // user key
+  void SetMaxPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(
+        &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+  }
+
+  // sets the internal key to be smaller or equal to all internal keys with
+  // this user key
+  void SetMinPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+                                               kValueTypeForSeek));
+  }
+
+  bool Valid() const {
+    ParsedInternalKey parsed;
+    return ParseInternalKey(Slice(rep_), &parsed);
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+  size_t size() { return rep_.size(); }
+
+  void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    SetFrom(ParsedInternalKey(_user_key, s, t));
+  }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void Clear() { rep_.clear(); }
+
+  // The underlying representation.
+  // Intended only to be used together with ConvertFromUserKey().
+  std::string* rep() { return &rep_; }
+
+  // Assuming that *rep() contains a user key, this method makes internal key
+  // out of it in-place. This saves a memcpy compared to Set()/SetFrom().
+  void ConvertFromUserKey(SequenceNumber s, ValueType t) {
+    AppendInternalKeyFooter(&rep_, s, t);
+  }
+
+  std::string DebugString(bool hex = false) const;
+};
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+                                          const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  assert(result->type <= ValueType::kMaxValue);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return IsExtendedValueType(result->type);
+}
+
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+  size_t ikey_sz = ikey->size();
+  assert(ikey_sz >= 8);
+  uint64_t newval = (seq << 8) | t;
+
+  // Note: Since C++11, strings are guaranteed to be stored contiguously and
+  // string::operator[]() is guaranteed not to change ikey.data().
+  EncodeFixed64(&(*ikey)[ikey_sz - 8], newval);
+}
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+  const size_t n = internal_key.size();
+  assert(n >= 8);
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  return num >> 8;
+}
+
+// A class to store keys in an efficient way. It allows:
+// 1. Users can either copy the key into it, or have it point to an unowned
+//    address.
+// 2. For a copied key, a short inline buffer is kept to reduce memory
+//    allocation for smaller keys.
+// 3. It tracks whether it holds a user key or an internal key, and allows
+//    conversion between them.
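+// For example (hypothetical, using only the methods declared below):
+//   IterKey k;
+//   k.SetUserKey("abc");                     // copied into the inline buffer
+//   k.SetUserKey(external, /*copy=*/false);  // pinned: points at unowned memory
+//   k.SetInternalKey(user_key, seq);         // builds user_key|seq|type in place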
+class IterKey {
+ public:
+  IterKey()
+      : buf_(space_),
+        key_(buf_),
+        key_size_(0),
+        buf_size_(sizeof(space_)),
+        is_user_key_(true) {}
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
+
+  ~IterKey() { ResetBuffer(); }
+
+  // The bool will be picked up by the next calls to SetKey
+  void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
+
+  // Returns the key in whichever format it was provided to IterKey
+  Slice GetKey() const { return Slice(key_, key_size_); }
+
+  Slice GetInternalKey() const {
+    assert(!IsUserKey());
+    return Slice(key_, key_size_);
+  }
+
+  Slice GetUserKey() const {
+    if (IsUserKey()) {
+      return Slice(key_, key_size_);
+    } else {
+      assert(key_size_ >= 8);
+      return Slice(key_, key_size_ - 8);
+    }
+  }
+
+  size_t Size() const { return key_size_; }
+
+  void Clear() { key_size_ = 0; }
+
+  // Append "non_shared_data" to its back, from "shared_len".
+  // This function is used in Block::Iter::ParseNextKey.
+  // shared_len: bytes in [0, shared_len-1] are retained;
+  // non_shared_data: data to be appended; its length must be
+  // >= non_shared_len.
+  void TrimAppend(const size_t shared_len, const char* non_shared_data,
+                  const size_t non_shared_len) {
+    assert(shared_len <= key_size_);
+    size_t total_size = shared_len + non_shared_len;
+
+    if (IsKeyPinned() /* key is not in buf_ */) {
+      // Copy the key from external memory to buf_ (copy shared_len bytes)
+      EnlargeBufferIfNeeded(total_size);
+      memcpy(buf_, key_, shared_len);
+    } else if (total_size > buf_size_) {
+      // Need to allocate space, delete previous space
+      char* p = new char[total_size];
+      memcpy(p, key_, shared_len);
+
+      if (buf_ != space_) {
+        delete[] buf_;
+      }
+
+      buf_ = p;
+      buf_size_ = total_size;
+    }
+
+    memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
+    key_ = buf_;
+    key_size_ = total_size;
+  }
+
+  Slice SetKey(const Slice& key, bool copy = true) {
+    // is_user_key_ expected to be set already via SetIsUserKey
+    return SetKeyImpl(key, copy);
+  }
+
+  Slice SetUserKey(const Slice& key, bool copy = true) {
+    is_user_key_ = true;
+    return SetKeyImpl(key, copy);
+  }
+
+  Slice SetInternalKey(const Slice& key, bool copy = true) {
+    is_user_key_ = false;
+    return SetKeyImpl(key, copy);
+  }
+
+  // Copies the content of key, updates the reference to the user key in ikey
+  // and returns a Slice referencing the new copy.
+  Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
+    size_t key_n = key.size();
+    assert(key_n >= 8);
+    SetInternalKey(key);
+    ikey->user_key = Slice(key_, key_n - 8);
+    return Slice(key_, key_n);
+  }
+
+  // Copy the key into IterKey's own buf_
+  void OwnKey() {
+    assert(IsKeyPinned() == true);
+
+    Reserve(key_size_);
+    memcpy(buf_, key_, key_size_);
+    key_ = buf_;
+  }
+
+  // Update the sequence number in the internal key. Guarantees not to
+  // invalidate slices to the key (and the user key).
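+  // (For illustration, hypothetical values: seq = 0x123456 with
+  // t = kTypeDeletion rewrites the trailing fixed64 to 0x12345600 in place;
+  // buf_ is not reallocated, which is why existing Slices stay valid.)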
+  void UpdateInternalKey(uint64_t seq, ValueType t) {
+    assert(!IsKeyPinned());
+    assert(key_size_ >= 8);
+    uint64_t newval = (seq << 8) | t;
+    EncodeFixed64(&buf_[key_size_ - 8], newval);
+  }
+
+  bool IsKeyPinned() const { return (key_ != buf_); }
+
+  void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+                      SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek) {
+    size_t psize = key_prefix.size();
+    size_t usize = user_key.size();
+    EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t));
+    if (psize > 0) {
+      memcpy(buf_, key_prefix.data(), psize);
+    }
+    memcpy(buf_ + psize, user_key.data(), usize);
+    EncodeFixed64(buf_ + usize + psize, PackSequenceAndType(s, value_type));
+
+    key_ = buf_;
+    key_size_ = psize + usize + sizeof(uint64_t);
+    is_user_key_ = false;
+  }
+
+  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek) {
+    SetInternalKey(Slice(), user_key, s, value_type);
+  }
+
+  void Reserve(size_t size) {
+    EnlargeBufferIfNeeded(size);
+    key_size_ = size;
+  }
+
+  void SetInternalKey(const ParsedInternalKey& parsed_key) {
+    SetInternalKey(Slice(), parsed_key);
+  }
+
+  void SetInternalKey(const Slice& key_prefix,
+                      const ParsedInternalKey& parsed_key_suffix) {
+    SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+                   parsed_key_suffix.sequence, parsed_key_suffix.type);
+  }
+
+  void EncodeLengthPrefixedKey(const Slice& key) {
+    auto size = key.size();
+    EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+    char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
+    memcpy(ptr, key.data(), size);
+    key_ = buf_;
+    is_user_key_ = true;
+  }
+
+  bool IsUserKey() const { return is_user_key_; }
+
+ private:
+  char* buf_;
+  const char* key_;
+  size_t key_size_;
+  size_t buf_size_;
+  char space_[32];  // Avoid allocation for short keys
+  bool is_user_key_;
+
+  Slice SetKeyImpl(const Slice& key, bool copy) {
+    size_t size = key.size();
+    if (copy) {
+      // Copy key to buf_
+      EnlargeBufferIfNeeded(size);
+      memcpy(buf_, key.data(), size);
+      key_ = buf_;
+    } else {
+      // Update key_ to point to external memory
+      key_ = key.data();
+    }
+    key_size_ = size;
+    return Slice(key_, key_size_);
+  }
+
+  void ResetBuffer() {
+    if (buf_ != space_) {
+      delete[] buf_;
+      buf_ = space_;
+    }
+    buf_size_ = sizeof(space_);
+    key_size_ = 0;
+  }
+
+  // Enlarge the buffer size if needed based on key_size.
+  // By default, static allocated buffer is used. Once there is a key
+  // larger than the static allocated buffer, another buffer is dynamically
+  // allocated, until a larger key buffer is requested. In that case, we
+  // reallocate buffer and delete the old one.
+  void EnlargeBufferIfNeeded(size_t key_size) {
+    // If size is smaller than buffer size, continue using current buffer,
+    // or the static allocated one, as default
+    if (key_size > buf_size_) {
+      EnlargeBuffer(key_size);
+    }
+  }
+
+  void EnlargeBuffer(size_t key_size);
+};
+
+// Convert from a SliceTransform of user keys to a SliceTransform of
+// internal keys.
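+// e.g. (a sketch; NewFixedPrefixTransform comes from rocksdb/slice_transform.h):
+//   InternalKeySliceTransform t(NewFixedPrefixTransform(4));
+//   t.Transform(ikey);  // prefix of ExtractUserKey(ikey), not of the raw bytes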
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  virtual const char* Name() const override { return transform_->Name(); }
+
+  virtual Slice Transform(const Slice& src) const override {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  virtual bool InDomain(const Slice& src) const override {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  virtual bool InRange(const Slice& dst) const override {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
+// Read the key of a record from a write batch.
+// If this record represents the default column family then cf_record
+// must be passed as false, otherwise it must be passed as true.
+extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
+                                       bool cf_record);
+
+// Read record from a write batch piece from input.
+// tag, column_family, key, value and blob are return values. Callers own the
+// Slice they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                       uint32_t* column_family, Slice* key,
+                                       Slice* value, Slice* blob, Slice* xid);
+
+// When a user calls DeleteRange() to delete a range of keys,
+// we store a serialized RangeTombstone in the MemTable and in SST files.
+// The struct here is its easy-to-understand form.
+// start_key_/end_key_ are the start and end user keys of the range to be
+// deleted.
+struct RangeTombstone {
+  Slice start_key_;
+  Slice end_key_;
+  SequenceNumber seq_;
+  RangeTombstone() = default;
+  RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
+      : start_key_(sk), end_key_(ek), seq_(sn) {}
+
+  RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
+    start_key_ = parsed_key.user_key;
+    seq_ = parsed_key.sequence;
+    end_key_ = value;
+  }
+
+  // be careful to use Serialize(), allocates new memory
+  std::pair<InternalKey, Slice> Serialize() const {
+    auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
+    Slice value = end_key_;
+    return std::make_pair(std::move(key), std::move(value));
+  }
+
+  // be careful to use SerializeKey(), allocates new memory
+  InternalKey SerializeKey() const {
+    return InternalKey(start_key_, seq_, kTypeRangeDeletion);
+  }
+
+  // The tombstone end-key is exclusive, so we generate an internal-key here
+  // which has a similar property. Using kMaxSequenceNumber guarantees that
+  // the returned internal-key will compare less than any other internal-key
+  // with the same user-key. This in turn guarantees that the serialized
+  // end-key for a tombstone such as [a-b] will compare less than the key "b".
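+  // (Concretely, as checked in dbformat_test.cc: for RangeTombstone("a",
+  // "b", 2), SerializeEndKey() compares below InternalKey("b", 3, kTypeValue)
+  // under InternalKeyComparator(BytewiseComparator()).)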
+ // + // be careful to use SerializeEndKey(), allocates new memory + InternalKey SerializeEndKey() const { + return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion); + } +}; + +inline int InternalKeyComparator::Compare(const Slice& akey, + const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +inline int InternalKeyComparator::CompareKeySeq(const Slice& akey, + const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + // Shift the number to exclude the last byte which contains the value type + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8; + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8; + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey. +struct ParsedInternalKeyComparator { + explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) + : cmp(c) {} + + bool operator()(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + return cmp->Compare(a, b) < 0; + } + + const InternalKeyComparator* cmp; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc new file mode 100644 index 000000000..a2c67795a --- /dev/null +++ b/src/rocksdb/db/dbformat_test.cc @@ -0,0 +1,207 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/dbformat.h" +#include "logging/logging.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +static std::string IKey(const std::string& user_key, + uint64_t seq, + ValueType vt) { + std::string encoded; + AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); + return encoded; +} + +static std::string Shorten(const std::string& s, const std::string& l) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); + return result; +} + +static std::string ShortSuccessor(const std::string& s) { + std::string result = s; + InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); + return result; +} + +static void TestKey(const std::string& key, + uint64_t seq, + ValueType vt) { + std::string encoded = IKey(key, seq, vt); + + Slice in(encoded); + ParsedInternalKey decoded("", 0, kTypeValue); + + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(key, decoded.user_key.ToString()); + ASSERT_EQ(seq, decoded.sequence); + ASSERT_EQ(vt, decoded.type); + + ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); +} + +class FormatTest : public testing::Test {}; + +TEST_F(FormatTest, InternalKey_EncodeDecode) { + const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; + const uint64_t seq[] = { + 1, 2, 3, + (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, + (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, + (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 + }; + for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { + for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { + TestKey(keys[k], seq[s], kTypeValue); + TestKey("hello", 1, kTypeDeletion); + } + } +} + +TEST_F(FormatTest, InternalKeyShortSeparator) { + // When user keys are same + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 99, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 101, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foo", 100, kTypeDeletion))); + + // When user keys are misordered + ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("bar", 99, kTypeValue))); + + // When user keys are different, but correctly ordered + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), + IKey("hello", 200, kTypeValue))); + + ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("ABC1AAAAA", 100, kTypeValue), + IKey("ABC2ABB", 200, kTypeValue))); + + ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), + IKey("AAA2AA", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue))); + + ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("AAA1AAA", 100, kTypeValue), + IKey("AAA2A", 200, kTypeValue))); + + ASSERT_EQ( + IKey("AAA1", 100, kTypeValue), + Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue))); + + // When start user key is prefix of limit user key + 
ASSERT_EQ(IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), + IKey("foobar", 200, kTypeValue))); + + // When limit user key is prefix of start user key + ASSERT_EQ(IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), + IKey("foo", 200, kTypeValue))); +} + +TEST_F(FormatTest, InternalKeyShortestSuccessor) { + ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + ShortSuccessor(IKey("foo", 100, kTypeValue))); + ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), + ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); +} + +TEST_F(FormatTest, IterKeyOperation) { + IterKey k; + const char p[] = "abcdefghijklmnopqrstuvwxyz"; + const char q[] = "0123456789"; + + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("")); + + k.TrimAppend(0, p, 3); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abc")); + + k.TrimAppend(1, p, 3); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("aabc")); + + k.TrimAppend(0, p, 26); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz")); + + k.TrimAppend(26, q, 10); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz0123456789")); + + k.TrimAppend(36, q, 1); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz01234567890")); + + k.TrimAppend(26, q, 1); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz0")); + + // Size going up, memory allocation is triggered + k.TrimAppend(27, p, 26); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("abcdefghijklmnopqrstuvwxyz0" + "abcdefghijklmnopqrstuvwxyz")); +} + +TEST_F(FormatTest, UpdateInternalKey) { + std::string user_key("abcdefghijklmnopqrstuvwxyz"); + uint64_t new_seq = 0x123456; + ValueType new_val_type = kTypeDeletion; + + std::string ikey; + AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue)); + size_t ikey_size = ikey.size(); + UpdateInternalKey(&ikey, new_seq, new_val_type); + ASSERT_EQ(ikey_size, ikey.size()); + + Slice in(ikey); + ParsedInternalKey decoded; + ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_EQ(user_key, decoded.user_key.ToString()); + ASSERT_EQ(new_seq, decoded.sequence); + ASSERT_EQ(new_val_type, decoded.type); +} + +TEST_F(FormatTest, RangeTombstoneSerializeEndKey) { + RangeTombstone t("a", "b", 2); + InternalKey k("b", 3, kTypeValue); + const InternalKeyComparator cmp(BytewiseComparator()); + ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc new file mode 100644 index 000000000..f202388c0 --- /dev/null +++ b/src/rocksdb/db/deletefile_test.cc @@ -0,0 +1,571 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteFileTest : public DBTestBase {
+ public:
+  const int numlevels_;
+  const std::string wal_dir_;
+
+  DeleteFileTest()
+      : DBTestBase("/deletefile_test"),
+        numlevels_(7),
+        wal_dir_(dbname_ + "/wal_files") {}
+
+  void SetOptions(Options* options) {
+    assert(options);
+    options->delete_obsolete_files_period_micros = 0;  // always do full purge
+    options->enable_thread_tracking = true;
+    options->write_buffer_size = 1024 * 1024 * 1000;
+    options->target_file_size_base = 1024 * 1024 * 1000;
+    options->max_bytes_for_level_base = 1024 * 1024 * 1000;
+    options->WAL_ttl_seconds = 300;     // Used to test log files
+    options->WAL_size_limit_MB = 1024;  // Used to test log files
+    options->wal_dir = wal_dir_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+      std::string temp = ToString(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(std::vector<LiveFileMetaData>& metadata,
+                      std::vector<int>* keysperlevel = nullptr) {
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(),
+              metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr));
+    }
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  }
+
+  void CheckFileTypeCounts(const std::string& dir, int required_log,
+                           int required_sst, int required_manifest) {
+    std::vector<std::string> filenames;
+    env_->GetChildren(dir, &filenames);
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kLogFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
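+  // [Editor's sketch, not part of the upstream patch] CheckFileTypeCounts()
+  // above relies on ParseFileName() mapping file names to FileType values
+  // under the standard RocksDB naming scheme. A minimal illustration of the
+  // expected classification:
+  //
+  //   uint64_t number;
+  //   FileType type;
+  //   ParseFileName("000005.log", &number, &type);       // type == kLogFile
+  //   ParseFileName("000007.sst", &number, &type);       // type == kTableFile
+  //   ParseFileName("MANIFEST-000004", &number, &type);  // kDescriptorFile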
+
+  static void DoSleep(void* arg) {
+    auto test = reinterpret_cast<DeleteFileTest*>(arg);
+    test->env_->SleepForMicroseconds(2 * 1000 * 1000);
+  }
+
+  // An empty job to guard that all jobs are processed
+  static void GuardFinish(void* /*arg*/) {
+    TEST_SYNC_POINT("DeleteFileTest::GuardFinish");
+  }
+};
+
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level1file = "";
+  int level1keycount = 0;
+  std::string level2file = "";
+  int level2keycount = 0;
+  int level1index = 0;
+  int level2index = 1;
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 2) {
+    level1index = 1;
+    level2index = 0;
+  }
+
+  level1file = metadata[level1index].name;
+  int startkey = atoi(metadata[level1index].smallestkey.c_str());
+  int endkey = atoi(metadata[level1index].largestkey.c_str());
+  level1keycount = (endkey - startkey + 1);
+  level2file = metadata[level2index].name;
+  startkey = atoi(metadata[level2index].smallestkey.c_str());
+  endkey = atoi(metadata[level2index].largestkey.c_str());
+  level2keycount = (endkey - startkey + 1);
+
+  // Controlled setup. Levels 1 and 2 should each hold 50K keys.
+  // This is a little fragile as it depends on the current
+  // compaction heuristics.
+  ASSERT_EQ(level1keycount, 50000);
+  ASSERT_EQ(level2keycount, 50000);
+
+  Status status = db_->DeleteFile("0.sst");
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // intermediate level files cannot be deleted.
+  status = db_->DeleteFile(level1file);
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // Lowest level file deletion should succeed.
+  ASSERT_OK(db_->DeleteFile(level2file));
+}
+
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  // there should be only one (empty) log file because CreateTwoLevels()
+  // flushes the memtables to disk
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+  // 2 ssts, 1 manifest
+  CheckFileTypeCounts(dbname_, 0, 2, 1);
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+  db_->CompactRange(compact_options, &first_slice, &last_slice);
+  // 1 sst after compaction
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  // this time, we keep an iterator alive
+  Reopen(options);
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  itr = db_->NewIterator(ReadOptions());
+  db_->CompactRange(compact_options, &first_slice, &last_slice);
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  ReadOptions read_options;
+  read_options.background_purge_on_iterator_cleanup = true;
+  itr = db_->NewIterator(read_options);
+  db_->CompactRange(compact_options, &first_slice, &last_slice);
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  test::SleepingBackgroundTask sleeping_task_before;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_before, Env::Priority::HIGH);
+  delete itr;
+  test::SleepingBackgroundTask sleeping_task_after;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure no purges are executed in the foreground
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  sleeping_task_before.WakeUp();
+  sleeping_task_before.WaitUntilDone();
+
+  // Make sure all background purges are executed
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  auto do_test = [&](bool bg_purge) {
+    ColumnFamilyOptions co;
+    co.max_write_buffer_size_to_maintain =
+        static_cast<int64_t>(co.write_buffer_size);
+    WriteOptions wo;
+    FlushOptions fo;
+    ColumnFamilyHandle* cfh = nullptr;
+
+    ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh));
+
+    ASSERT_OK(db_->Put(wo, cfh, "pika", "chu"));
+    ASSERT_OK(db_->Flush(fo, cfh));
+    // Expect 1 sst file.
+    CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+    ASSERT_OK(db_->DropColumnFamily(cfh));
+    // Still 1 file; it won't be deleted while the ColumnFamilyHandle is alive.
+    CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+    delete cfh;
+    test::SleepingBackgroundTask sleeping_task_after;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_task_after, Env::Priority::HIGH);
+    // If background purge is enabled, the file should still be there.
+    CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1);
+    TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1");
+
+    // Execute background purges.
+    sleeping_task_after.WakeUp();
+    sleeping_task_after.WaitUntilDone();
+    // The file should have been deleted.
+    CheckFileTypeCounts(dbname_, 0, 0, 1);
+  };
+
+  {
+    SCOPED_TRACE("avoid_unnecessary_blocking_io = false");
+    do_test(false);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
+        "DBImpl::BGWorkPurge:start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.avoid_unnecessary_blocking_io = true;
+  options.create_if_missing = false;
+  Reopen(options);
+  {
+    SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
+    do_test(true);
+  }
+}
+
+// This test reproduces a bug where the iterator cleanup function read an
+// invalid ReadOptions object
+TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  {
+    ReadOptions read_options;
+    read_options.background_purge_on_iterator_cleanup = true;
+    itr = db_->NewIterator(read_options);
+    // ReadOptions is deleted, but the iterator cleanup function should not
+    // be affected
+  }
+
+  db_->CompactRange(compact_options, &first_slice, &last_slice);
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+
+  test::SleepingBackgroundTask sleeping_task_after;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure all background purges are executed
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  CreateTwoLevels();
+  ReadOptions read_options;
+  read_options.background_purge_on_iterator_cleanup = true;
+  Iterator* itr1 = db_->NewIterator(read_options);
+  CreateTwoLevels();
+  Iterator* itr2 = db_->NewIterator(read_options);
+  db_->CompactRange(compact_options, &first_slice, &last_slice);
+  // 5 sst files after 2 compactions with 2 live iterators
+  CheckFileTypeCounts(dbname_, 0, 5, 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  // ~DBImpl should wait until all BGWorkPurge jobs are finished
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"},
+       {"DeleteFileTest::GuardFinish",
+        "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  delete itr1;
+  env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH);
+  delete itr2;
+  env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH);
+  Close();
+
+  TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose");
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
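+// [Editor's sketch, not part of the upstream patch] The tests above drive
+// thread interleavings through sync points. LoadDependency() takes a list of
+// {predecessor, successor} pairs: a thread reaching the successor point
+// blocks until some thread has passed the predecessor point. A minimal,
+// hypothetical usage with made-up point names:
+//
+//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+//       {{"Writer::Done", "Reader::Start"}});
+//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+//   // In the writer thread:  TEST_SYNC_POINT("Writer::Done");
+//   // In the reader thread:  TEST_SYNC_POINT("Reader::Start");  // waits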
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  ReadOptions read_options;
+  Iterator* it = db_->NewIterator(read_options);
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level2file;
+
+  ASSERT_EQ(metadata.size(), static_cast<size_t>(2));
+  if (metadata[0].level == 1) {
+    level2file = metadata[1].name;
+  } else {
+    level2file = metadata[0].name;
+  }
+
+  Status status = db_->DeleteFile(level2file);
+  fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(),
+          status.ToString().c_str());
+  ASSERT_TRUE(status.ok());
+  it->SeekToFirst();
+  int numKeysIterated = 0;
+  while (it->Valid()) {
+    numKeysIterated++;
+    it->Next();
+  }
+  ASSERT_EQ(numKeysIterated, 50000);
+  delete it;
+}
+
+TEST_F(DeleteFileTest, DeleteLogFiles) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  AddKeys(10, 0);
+  VectorLogPtr logfiles;
+  db_->GetSortedWalFiles(logfiles);
+  ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file, which is expected to be alive, and try to delete
+  // it. This should not succeed because live logs are not allowed to be
+  // deleted.
+  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+  fprintf(stdout, "Deleting alive log file %s\n",
+          alive_log->PathName().c_str());
+  ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+  logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys.
+  // Call Flush again to flush out the memtable and move the alive log to
+  // the archive, then try to delete the archived log file.
+  FlushOptions fopts;
+  db_->Flush(fopts);
+  AddKeys(10, 0);
+  db_->Flush(fopts);
+  db_->GetSortedWalFiles(logfiles);
+  ASSERT_GT(logfiles.size(), 0UL);
+  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+  fprintf(stdout, "Deleting archived log file %s\n",
+          archived_log->PathName().c_str());
+  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+  ASSERT_EQ(Status::NotFound(),
+            env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+}
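+// [Editor's sketch, not part of the upstream patch] DeleteLogFiles above
+// depends on the WAL lifecycle: the current ("alive") WAL only moves to the
+// archive once a flush makes its contents durable in an SST, and the
+// WAL_ttl_seconds / WAL_size_limit_MB options set in SetOptions() keep
+// archived logs around long enough to be observed. A condensed view of the
+// flow the test exercises:
+//
+//   db->Put(...);                 // goes to the alive WAL
+//   db->Flush(FlushOptions());    // memtable -> SST; old WAL can be archived
+//   db->GetSortedWalFiles(logs);  // logs.front()->Type() == kArchivedLogFile
+//   db->DeleteFile(logs.front()->PathName());  // allowed for archived logs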
+
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+  CreateAndReopenWithCF({"new_cf"}, options);
+
+  Random rnd(5);
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+                       test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+                       test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_EQ("new_cf", metadata[0].column_family_name);
+  ASSERT_EQ("new_cf", metadata[1].column_family_name);
+  auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument());
+  ASSERT_OK(db_->DeleteFile(old_file));
+
+  {
+    std::unique_ptr<Iterator> itr(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options);
+
+  {
+    std::unique_ptr<Iterator> itr(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif  // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/error_handler.cc b/src/rocksdb/db/error_handler.cc
new file mode 100644
index 000000000..3ba4d9fd9
--- /dev/null
+++ b/src/rocksdb/db/error_handler.cc
@@ -0,0 +1,344 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/error_handler.h"
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maps to help decide the severity of an error based on the
+// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
+// is set or not. There are 3 maps, going from most specific to least specific
+// (i.e. from all 4 fields in a tuple to only the BackgroundErrorReason and
+// paranoid_checks). The less specific maps serve as a catch-all in case we
+// miss a specific error code or subcode.
+std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode,
+                    bool>,
+         Status::Severity>
+    ErrorSeverityMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kSoftError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+                         true),
+         Status::Severity::kHardError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kSpaceLimit, true),
+         Status::Severity::kHardError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kHardError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
+         Status::Severity>
+    DefaultErrorSeverityMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+    DefaultReasonMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction, false),
+         Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, false),
+         Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
+         Status::Severity::kFatalError},
+        // Errors during Memtable update
+        {std::make_tuple(BackgroundErrorReason::kMemTable, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kMemTable, false),
+         Status::Severity::kFatalError},
+};
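+
+// [Editor's sketch, not part of the upstream patch] The intended lookup order
+// of the three maps above; SetBGError() below inlines the same cascade:
+//
+//   Status::Severity sev = Status::Severity::kFatalError;  // fallback
+//   auto it = ErrorSeverityMap.find(
+//       std::make_tuple(reason, code, subcode, paranoid));
+//   if (it != ErrorSeverityMap.end()) {
+//     sev = it->second;  // most specific: reason + code + subcode + paranoid
+//   } else {
+//     // ... otherwise consult DefaultErrorSeverityMap (drops the subcode),
+//     // and finally DefaultReasonMap (reason + paranoid_checks only).
+//   }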
+
+void ErrorHandler::CancelErrorRecovery() {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+
+  // We'll release the lock before calling sfm, so make sure no new
+  // recovery gets scheduled at that point
+  auto_recovery_ = false;
+  SstFileManagerImpl* sfm = reinterpret_cast<SstFileManagerImpl*>(
+      db_options_.sst_file_manager.get());
+  if (sfm) {
+    // This may or may not cancel a pending recovery
+    db_mutex_->Unlock();
+    bool cancelled = sfm->CancelErrorRecovery(this);
+    db_mutex_->Lock();
+    if (cancelled) {
+      recovery_in_prog_ = false;
+    }
+  }
+#endif
+}
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity and error recovery strategy. The
+// high-level algorithm is as follows:
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+//    DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status-code-specific override function to adjust the severity
+//    if needed. The reason for this is that our ability to recover may
+//    depend on the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification
+//    callback is called, which can disable the auto recovery even if we
+//    decide it is feasible
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+//    the actual recovery. If no sst file manager is specified in DBOptions,
+//    a default one is allocated during DB::Open(), so there will always be
+//    one.
+// This can also get called as part of a recovery operation. In that case, we
+// also track the error separately in recovery_error_ so we can tell at the
+// end whether recovery succeeded or not
+Status ErrorHandler::SetBGError(const Status& bg_err,
+                                BackgroundErrorReason reason) {
+  db_mutex_->AssertHeld();
+
+  if (bg_err.ok()) {
+    return Status::OK();
+  }
+
+  bool paranoid = db_options_.paranoid_checks;
+  Status::Severity sev = Status::Severity::kFatalError;
+  Status new_bg_err;
+  bool found = false;
+
+  {
+    auto entry = ErrorSeverityMap.find(
+        std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
+    if (entry != ErrorSeverityMap.end()) {
+      sev = entry->second;
+      found = true;
+    }
+  }
+
+  if (!found) {
+    auto entry = DefaultErrorSeverityMap.find(
+        std::make_tuple(reason, bg_err.code(), paranoid));
+    if (entry != DefaultErrorSeverityMap.end()) {
+      sev = entry->second;
+      found = true;
+    }
+  }
+
+  if (!found) {
+    auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+    if (entry != DefaultReasonMap.end()) {
+      sev = entry->second;
+    }
+  }
+
+  new_bg_err = Status(bg_err, sev);
+
+  // Check if recovery is currently in progress. If it is, we will save this
+  // error so we can check at the end whether recovery succeeded or not
+  if (recovery_in_prog_ && recovery_error_.ok()) {
+    recovery_error_ = new_bg_err;
+  }
+
+  bool auto_recovery = auto_recovery_;
+  if (new_bg_err.severity() >= Status::Severity::kFatalError &&
+      auto_recovery) {
+    auto_recovery = false;
+  }
+
+  // Allow some error-specific overrides
+  if (new_bg_err == Status::NoSpace()) {
+    new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
+  }
+
+  if (!new_bg_err.ok()) {
+    Status s = new_bg_err;
+    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
+                                          db_mutex_, &auto_recovery);
+    if (!s.ok() && (s.severity() > bg_error_.severity())) {
+      bg_error_ = s;
+    } else {
+      // This error is less severe than the previously encountered error.
+      // Don't take any further action
+      return bg_error_;
+    }
+  }
+
+  if (auto_recovery) {
+    recovery_in_prog_ = true;
+
+    // Kick off error-specific recovery
+    if (bg_error_ == Status::NoSpace()) {
+      RecoverFromNoSpace();
+    }
+  }
+  return bg_error_;
+}
+
+Status ErrorHandler::OverrideNoSpaceError(Status bg_error,
+                                          bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+  if (bg_error.severity() >= Status::Severity::kFatalError) {
+    return bg_error;
+  }
+
+  if (db_options_.sst_file_manager.get() == nullptr) {
+    // We rely on SFM to poll for enough disk space and recover
+    *auto_recovery = false;
+    return bg_error;
+  }
+
+  if (db_options_.allow_2pc &&
+      (bg_error.severity() <= Status::Severity::kSoftError)) {
+    // We don't know how to recover, as the contents of the current WAL file
+    // may be inconsistent, and it may be needed for 2PC. If 2PC is not
+    // enabled, we can just flush the memtable and discard the log
+    *auto_recovery = false;
+    return Status(bg_error, Status::Severity::kFatalError);
+  }
+
+  {
+    uint64_t free_space;
+    if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+                                      &free_space) == Status::NotSupported()) {
+      *auto_recovery = false;
+    }
+  }
+
+  return bg_error;
+#else
+  (void)auto_recovery;
+  return Status(bg_error, Status::Severity::kFatalError);
+#endif
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+#ifndef ROCKSDB_LITE
+  SstFileManagerImpl* sfm = reinterpret_cast<SstFileManagerImpl*>(
+      db_options_.sst_file_manager.get());
+
+  // Inform SFM of the error, so it can kick off the recovery
+  if (sfm) {
+    sfm->StartErrorRecovery(this, bg_error_);
+  }
+#endif
+}
+
+Status ErrorHandler::ClearBGError() {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+
+  // Signal that recovery succeeded
+  if (recovery_error_.ok()) {
+    Status old_bg_error = bg_error_;
+    bg_error_ = Status::OK();
+    recovery_in_prog_ = false;
+    EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
+                                                 old_bg_error, db_mutex_);
+  }
+  return recovery_error_;
+#else
+  return bg_error_;
+#endif
+}
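+
+// [Editor's sketch, not part of the upstream patch] The severity handling
+// above relies on the two-argument Status constructor, which wraps an
+// existing Status with a Severity while keeping its code and subcode. A
+// minimal illustration:
+//
+//   Status base = Status::NoSpace("Out of space");
+//   Status hard = Status(base, Status::Severity::kHardError);
+//   assert(hard.subcode() == Status::SubCode::kNoSpace);  // code preserved
+//   assert(hard.severity() == Status::Severity::kHardError);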
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+#ifndef ROCKSDB_LITE
+  InstrumentedMutexLock l(db_mutex_);
+  if (is_manual) {
+    // If it's a manual recovery and there's a background recovery in
+    // progress, return a busy status
+    if (recovery_in_prog_) {
+      return Status::Busy();
+    }
+    recovery_in_prog_ = true;
+  }
+
+  if (bg_error_.severity() == Status::Severity::kSoftError) {
+    // Simply clear the background error and return
+    recovery_error_ = Status::OK();
+    return ClearBGError();
+  }
+
+  // Reset recovery_error_. We will use this to record any errors that happen
+  // during the recovery process. While recovering, the only operations that
+  // can generate background errors should be the flush operations
+  recovery_error_ = Status::OK();
+  Status s = db_->ResumeImpl();
+  // For manual recovery, shutdown, and fatal error cases, set
+  // recovery_in_prog_ to false. For automatic background recovery, leave it
+  // as is regardless of success or failure, as it will be retried
+  if (is_manual || s.IsShutdownInProgress() ||
+      bg_error_.severity() >= Status::Severity::kFatalError) {
+    recovery_in_prog_ = false;
+  }
+  return s;
+#else
+  (void)is_manual;
+  return bg_error_;
+#endif
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler.h b/src/rocksdb/db/error_handler.h
new file mode 100644
index 000000000..7276f6510
--- /dev/null
+++ b/src/rocksdb/db/error_handler.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+class ErrorHandler {
+ public:
+  ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+               InstrumentedMutex* db_mutex)
+      : db_(db),
+        db_options_(db_options),
+        bg_error_(Status::OK()),
+        recovery_error_(Status::OK()),
+        db_mutex_(db_mutex),
+        auto_recovery_(false),
+        recovery_in_prog_(false) {}
+  ~ErrorHandler() {}
+
+  void EnableAutoRecovery() { auto_recovery_ = true; }
+
+  Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+                                    Status::Code code,
+                                    Status::SubCode subcode);
+
+  Status SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+  Status GetBGError() { return bg_error_; }
+
+  Status GetRecoveryError() { return recovery_error_; }
+
+  Status ClearBGError();
+
+  bool IsDBStopped() {
+    return !bg_error_.ok() &&
+           bg_error_.severity() >= Status::Severity::kHardError;
+  }
+
+  bool IsBGWorkStopped() {
+    return !bg_error_.ok() &&
+           (bg_error_.severity() >= Status::Severity::kHardError ||
+            !auto_recovery_);
+  }
+
+  bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+  Status RecoverFromBGError(bool is_manual = false);
+  void CancelErrorRecovery();
+
+ private:
+  DBImpl* db_;
+  const ImmutableDBOptions& db_options_;
+  Status bg_error_;
+  // A separate Status variable used to record any errors during the
+  // recovery process from hard errors
+  Status recovery_error_;
+  InstrumentedMutex* db_mutex_;
+  // A flag indicating whether automatic recovery from errors is enabled
+  bool auto_recovery_;
+  bool recovery_in_prog_;
+
+  Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery);
+  void RecoverFromNoSpace();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler_test.cc b/src/rocksdb/db/error_handler_test.cc
new file mode 100644
index 000000000..b9d78490c
--- /dev/null
+++ b/src/rocksdb/db/error_handler_test.cc
@@ -0,0 +1,871 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/sst_file_manager.h"
+#include "test_util/fault_injection_test_env.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBErrorHandlingTest : public DBTestBase {
+ public:
+  DBErrorHandlingTest() : DBTestBase("/db_error_handling_test") {}
+
+  std::string GetManifestNameFromLiveFiles() {
+    std::vector<std::string> live_files;
+    uint64_t manifest_size;
+
+    dbfull()->GetLiveFiles(live_files, &manifest_size, false);
+    for (auto& file : live_files) {
+      uint64_t num = 0;
+      FileType type;
+      if (ParseFileName(file, &num, &type) && type == kDescriptorFile) {
+        return file;
+      }
+    }
+    return "";
+  }
+};
+
+class DBErrorHandlingEnv : public EnvWrapper {
+ public:
+  DBErrorHandlingEnv()
+      : EnvWrapper(Env::Default()),
+        trig_no_space(false),
+        trig_io_error(false) {}
+
+  void SetTrigNoSpace() { trig_no_space = true; }
+  void SetTrigIoError() { trig_io_error = true; }
+
+ private:
+  bool trig_no_space;
+  bool trig_io_error;
+};
+
+class ErrorHandlerListener : public EventListener {
+ public:
+  ErrorHandlerListener()
+      : mutex_(),
+        cv_(&mutex_),
+        no_auto_recovery_(false),
+        recovery_complete_(false),
+        file_creation_started_(false),
+        override_bg_error_(false),
+        file_count_(0),
+        fault_env_(nullptr) {}
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /*ti*/) override {
+    InstrumentedMutexLock l(&mutex_);
+    file_creation_started_ = true;
+    if (file_count_ > 0) {
+      if (--file_count_ == 0) {
+        fault_env_->SetFilesystemActive(false, file_creation_error_);
+        file_creation_error_ = Status::OK();
+      }
+    }
+    cv_.SignalAll();
+  }
+
+  void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
+                            Status /*bg_error*/,
+                            bool* auto_recovery) override {
+    if (*auto_recovery && no_auto_recovery_) {
+      *auto_recovery = false;
+    }
+  }
+
+  void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
+    InstrumentedMutexLock l(&mutex_);
+    recovery_complete_ = true;
+    cv_.SignalAll();
+  }
+
+  bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+    InstrumentedMutexLock l(&mutex_);
+    while (!recovery_complete_) {
+      cv_.Wait();
+    }
+    if (recovery_complete_) {
+      recovery_complete_ = false;
+      return true;
+    }
+    return false;
+  }
+
+  void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) {
+    InstrumentedMutexLock l(&mutex_);
+    while (!file_creation_started_) {
+      cv_.Wait();
+    }
+    file_creation_started_ = false;
+  }
+
+  void OnBackgroundError(BackgroundErrorReason /*reason*/,
+                         Status* bg_error) override {
+    if (override_bg_error_) {
+      *bg_error = bg_error_;
+      override_bg_error_ = false;
+    }
+  }
+
+  void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+  void OverrideBGError(Status bg_err) {
+    bg_error_ = bg_err;
+    override_bg_error_ = true;
+  }
+
+  void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count,
+                               Status s) {
+    fault_env_ = env;
+    file_count_ = file_count;
+    file_creation_error_ = s;
+  }
+
+ private:
+  InstrumentedMutex mutex_;
+  InstrumentedCondVar cv_;
+  bool no_auto_recovery_;
+  bool recovery_complete_;
+  bool file_creation_started_;
+  bool override_bg_error_;
+  int file_count_;
+  Status file_creation_error_;
+  Status bg_error_;
+  FaultInjectionTestEnv* fault_env_;
+};
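+
+// [Editor's sketch, not part of the upstream patch] Typical flow of the
+// fixture above: arm the fault, run the workload, then wait for the recovery
+// callback. All names come from this file:
+//
+//   std::unique_ptr<FaultInjectionTestEnv> fault_env(
+//       new FaultInjectionTestEnv(Env::Default()));
+//   std::shared_ptr<ErrorHandlerListener> listener(
+//       new ErrorHandlerListener());
+//   listener->EnableAutoRecovery();
+//   // Fail filesystem writes once the 3rd table file starts being built
+//   listener->InjectFileCreationError(fault_env.get(), 3,
+//                                     Status::NoSpace("Out of space"));
+//   // ... run Puts/Flushes against a DB opened with fault_env ...
+//   fault_env->SetFilesystemActive(true);  // heal the filesystem
+//   ASSERT_TRUE(listener->WaitForRecovery(5000000));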
+
+TEST_F(DBErrorHandlingTest, FlushWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = fault_env.get();
+  options.listeners.emplace_back(listener);
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  Put(Key(0), "val");
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_env->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_EQ(s, Status::OK());
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingTest, ManifestWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = fault_env.get();
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  Put(Key(0), "val");
+  Flush();
+  Put(Key(1), "val");
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_env->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_EQ(s, Status::OK());
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingTest, DoubleManifestWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = fault_env.get();
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  Put(Key(0), "val");
+  Flush();
+  Put(Key(1), "val");
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  fault_env->SetFilesystemActive(true);
+
+  // This Resume() will attempt to create a new manifest file and fail again
+  s = dbfull()->Resume();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  fault_env->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // A successful Resume() will create a new manifest file
+  s = dbfull()->Resume();
+  ASSERT_EQ(s, Status::OK());
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingTest, CompactionManifestWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.env = fault_env.get();
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+  std::atomic<bool> fail_manifest(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  Put(Key(0), "val");
+  Put(Key(2), "val");
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      // Wait for the flush of the 2nd L0 file before starting the compaction
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"},
+       // Wait for the compaction to detect the manifest write error
+       {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+       // Make the compaction thread wait for the error to be cleared
+       {"CompactionManifestWriteError:1",
+        "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+       // Wait for the DB instance to clear bg_error before calling
+       // TEST_WaitForCompact
+       {"SstFileManagerImpl::ErrorCleared",
+        "CompactionManifestWriteError:2"}});
+  // trigger a manifest write failure in the compaction thread
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        if (fail_manifest.load()) {
+          fault_env->SetFilesystemActive(false,
+                                         Status::NoSpace("Out of space"));
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Put(Key(1), "val");
+  // This Flush will trigger a compaction, which will fail when appending to
+  // the manifest
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+
+  TEST_SYNC_POINT("CompactionManifestWriteError:0");
+  // Clear all errors so that when the compaction is retried, it will succeed
+  fault_env->SetFilesystemActive(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  TEST_SYNC_POINT("CompactionManifestWriteError:1");
+  TEST_SYNC_POINT("CompactionManifestWriteError:2");
+
+  s = dbfull()->TEST_WaitForCompact();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(s, Status::OK());
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  ASSERT_EQ("val", Get(Key(2)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingTest, CompactionWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.env = fault_env.get();
+  Status s;
+  DestroyAndReopen(options);
+
+  Put(Key(0), "val");
+  Put(Key(2), "val");
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+
+  listener->OverrideBGError(
+      Status(Status::NoSpace(), Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
{{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Put(Key(1), "val"); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + + fault_env->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, CorruptionError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.env = fault_env.get(); + Status s; + DestroyAndReopen(options); + + Put(Key(0), "va;"); + Put(Key(2), "va;"); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_env->SetFilesystemActive(false, Status::Corruption("Corruption")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Put(Key(1), "val"); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + + fault_env->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_NE(s, Status::OK()); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + s = Put(Key(1), "val"); + ASSERT_EQ(s, Status::OK()); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, FailRecoverFlushError) { + std::unique_ptr fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), 
+  // We should be able to shut down the database while auto recovery is
+  // going on in the background
+  Close();
+  DestroyDB(dbname_, options);
+}
+
+TEST_F(DBErrorHandlingTest, WALWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.env = fault_env.get();
+  options.listeners.emplace_back(listener);
+  Status s;
+  Random rnd(301);
+
+  listener->EnableAutoRecovery();
+  DestroyAndReopen(options);
+
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      batch.Put(Key(i), RandomString(&rnd, 1024));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK());
+  }
+
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    for (auto i = 100; i < 199; ++i) {
+      batch.Put(Key(i), RandomString(&rnd, 1024));
+    }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_env->SetFilesystemActive(false,
+                                           Status::NoSpace("Out of space"));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_EQ(s, Status::NoSpace());
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_env->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  for (auto i = 0; i < 199; ++i) {
+    if (i < 100) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+  Reopen(options);
+  for (auto i = 0; i < 199; ++i) {
+    if (i < 100) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+  Close();
+}
+
+TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener());
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.env = fault_env.get();
+  options.listeners.emplace_back(listener);
+  Status s;
+  Random rnd(301);
+
+  listener->EnableAutoRecovery();
+  CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+  {
+    WriteBatch batch;
+
+    for (auto i = 1; i < 4; ++i) {
+      for (auto j = 0; j < 100; ++j) {
+        batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024));
+      }
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK());
+  }
+
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    // Write to one CF
+    for (auto i = 100; i < 199; ++i) {
+      batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024));
+    }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_env->SetFilesystemActive(false,
+                                           Status::NoSpace("Out of space"));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_EQ(s, Status::NoSpace());
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_env->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+  for (auto i = 1; i < 4; ++i) {
+    // Every CF should have been flushed
+    ASSERT_EQ(NumTableFilesAtLevel(0, i), 1);
+  }
+
+  for (auto i = 1; i < 4; ++i) {
+    for (auto j = 0; j < 199; ++j) {
+      if (j < 100) {
+        ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+      } else {
+        ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+      }
+    }
+  }
+  ReopenWithColumnFamilies({"default", "one", "two", "three"}, options);
+  for (auto i = 1; i < 4; ++i) {
+    for (auto j = 0; j < 199; ++j) {
+      if (j < 100) {
+        ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+      } else {
+        ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+      }
+    }
+  }
+  Close();
+}
+
+TEST_F(DBErrorHandlingTest, MultiDBCompactionError) {
+  FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default());
+  std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env;
+  std::vector<Options> options;
+  std::vector<std::shared_ptr<ErrorHandlerListener>> listener;
+  std::vector<DB*> db;
+  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+  int kNumDbInstances = 3;
+  Random rnd(301);
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    listener.emplace_back(new ErrorHandlerListener());
+    options.emplace_back(GetDefaultOptions());
+    fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default()));
+    options[i].create_if_missing = true;
+    options[i].level0_file_num_compaction_trigger = 2;
+    options[i].writable_file_max_buffer_size = 32768;
+    options[i].env = fault_env[i].get();
+    options[i].listeners.emplace_back(listener[i]);
+    options[i].sst_file_manager = sfm;
+    DB* dbptr;
+    char buf[16];
+
+    listener[i]->EnableAutoRecovery();
+    // Setup for returning an error for the 3rd SST, which would be level 1
+    listener[i]->InjectFileCreationError(fault_env[i].get(), 3,
+                                         Status::NoSpace("Out of space"));
+    snprintf(buf, sizeof(buf), "_%d", i);
+    DestroyDB(dbname_ + std::string(buf), options[i]);
+    ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr),
+              Status::OK());
+    db.emplace_back(dbptr);
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    for (auto j = 0; j <= 100; ++j) {
+      batch.Put(Key(j), RandomString(&rnd, 1024));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+    ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+  }
+
+  def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    // Write to one CF
+    for (auto j = 100; j < 199; ++j) {
+      batch.Put(Key(j), RandomString(&rnd, 1024));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+    ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+    ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+    fault_env[i]->SetFilesystemActive(true);
+  }
+
+  def_env->SetFilesystemActive(true);
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    std::string prop;
+    ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+    ASSERT_EQ(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true),
+              Status::OK());
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(0), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 0);
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(1), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 1);
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "_%d", i);
+    delete db[i];
+    fault_env[i]->SetFilesystemActive(true);
+    if (getenv("KEEP_DB")) {
+      printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+    } else {
+      Status s = DestroyDB(dbname_ + std::string(buf), options[i]);
+    }
+  }
+  options.clear();
+  sfm.reset();
+  delete def_env;
+}
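+
+// [Editor's note, not part of the upstream patch] Why the "3rd SST" above
+// lands in level 1: with level0_file_num_compaction_trigger = 2, the two
+// explicit Flush() calls produce SSTs #1 and #2 in L0, which immediately
+// triggers a compaction; its output file, #3, is the first L1 file. Failing
+// creation of file #3 therefore injects the error into the compaction path
+// (a soft error here, with a shared SstFileManager), while failing file #2
+// hits the flush path (a hard error), as MultiDBVariousErrors below
+// exercises.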
+TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) {
+  FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default());
+  std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env;
+  std::vector<Options> options;
+  std::vector<std::shared_ptr<ErrorHandlerListener>> listener;
+  std::vector<DB*> db;
+  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+  int kNumDbInstances = 3;
+  Random rnd(301);
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    listener.emplace_back(new ErrorHandlerListener());
+    options.emplace_back(GetDefaultOptions());
+    fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default()));
+    options[i].create_if_missing = true;
+    options[i].level0_file_num_compaction_trigger = 2;
+    options[i].writable_file_max_buffer_size = 32768;
+    options[i].env = fault_env[i].get();
+    options[i].listeners.emplace_back(listener[i]);
+    options[i].sst_file_manager = sfm;
+    DB* dbptr;
+    char buf[16];
+
+    listener[i]->EnableAutoRecovery();
+    switch (i) {
+      case 0:
+        // Setup for returning an error for the 3rd SST, which would be
+        // level 1
+        listener[i]->InjectFileCreationError(fault_env[i].get(), 3,
+                                             Status::NoSpace("Out of space"));
+        break;
+      case 1:
+        // Setup for returning an error after the 1st SST, which would
+        // result in a hard error
+        listener[i]->InjectFileCreationError(fault_env[i].get(), 2,
+                                             Status::NoSpace("Out of space"));
+        break;
+      default:
+        break;
+    }
+    snprintf(buf, sizeof(buf), "_%d", i);
+    DestroyDB(dbname_ + std::string(buf), options[i]);
+    ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr),
+              Status::OK());
+    db.emplace_back(dbptr);
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    for (auto j = 0; j <= 100; ++j) {
+      batch.Put(Key(j), RandomString(&rnd, 1024));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+    ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+  }
+
+  def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    // Write to one CF
+    for (auto j = 100; j < 199; ++j) {
+      batch.Put(Key(j), RandomString(&rnd, 1024));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK());
+    if (i != 1) {
+      ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK());
+    } else {
+      ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace());
+    }
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+    switch (i) {
+      case 0:
+        ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+        break;
+      case 1:
+        ASSERT_EQ(s.severity(), Status::Severity::kHardError);
+        break;
+      case 2:
+        ASSERT_EQ(s, Status::OK());
+        break;
+    }
+    fault_env[i]->SetFilesystemActive(true);
+  }
+
+  def_env->SetFilesystemActive(true);
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    std::string prop;
+    if (i < 2) {
+      ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+    }
+    if (i == 1) {
+      ASSERT_EQ(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true),
+                Status::OK());
+    }
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(0), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 0);
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(1), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 1);
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "_%d", i);
+    fault_env[i]->SetFilesystemActive(true);
+    delete db[i];
+    if (getenv("KEEP_DB")) {
+      printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+    } else {
+      DestroyDB(dbname_ + std::string(buf), options[i]);
std::string(buf), options[i]); + } + } + options.clear(); + delete def_env; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc new file mode 100644 index 000000000..57aa711fc --- /dev/null +++ b/src/rocksdb/db/event_helpers.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/event_helpers.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +template +inline T SafeDivide(T a, T b) { + return b == 0 ? 0 : a / b; +} +} // namespace + +void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) { + *jwriter << "time_micros" + << std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); +} + +#ifndef ROCKSDB_LITE +void EventHelpers::NotifyTableFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, TableFileCreationReason reason) { + TableFileCreationBriefInfo info; + info.db_name = db_name; + info.cf_name = cf_name; + info.file_path = file_path; + info.job_id = job_id; + info.reason = reason; + for (auto& listener : listeners) { + listener->OnTableFileCreationStarted(info); + } +} +#endif // !ROCKSDB_LITE + +void EventHelpers::NotifyOnBackgroundError( + const std::vector>& listeners, + BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex, + bool* auto_recovery) { +#ifndef ROCKSDB_LITE + if (listeners.size() == 0U) { + return; + } + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + listener->OnBackgroundError(reason, bg_error); + if (*auto_recovery) { + listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery); + } + } + db_mutex->Lock(); +#else + (void)listeners; + (void)reason; + (void)bg_error; + (void)db_mutex; + (void)auto_recovery; +#endif // ROCKSDB_LITE +} + +void EventHelpers::LogAndNotifyTableFileCreationFinished( + EventLogger* event_logger, + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, const FileDescriptor& fd, + uint64_t oldest_blob_file_number, const TableProperties& table_properties, + TableFileCreationReason reason, const Status& s) { + if (s.ok() && event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + jwriter << "cf_name" << cf_name << "job" << job_id << "event" + << "table_file_creation" + << "file_number" << fd.GetNumber() << "file_size" + << fd.GetFileSize(); + + // table_properties + { + jwriter << "table_properties"; + jwriter.StartObject(); + + // basic properties: + jwriter << "data_size" << table_properties.data_size << "index_size" + << table_properties.index_size << "index_partitions" + << table_properties.index_partitions << "top_level_index_size" + << table_properties.top_level_index_size + << "index_key_is_user_key" + << table_properties.index_key_is_user_key + << 
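+// A minimal sketch (not part of the upstream file) of a listener that makes
+// use of the mutable Status* handed out by NotifyOnBackgroundError() above.
+// The class name and the decision to swallow NoSpace errors are illustrative
+// assumptions; the OnBackgroundError() hook itself is the public
+// EventListener API.
+#ifndef ROCKSDB_LITE
+namespace {
+class SwallowNoSpaceListener : public EventListener {
+ public:
+  void OnBackgroundError(BackgroundErrorReason /*reason*/,
+                         Status* bg_error) override {
+    if (bg_error->IsNoSpace()) {
+      // Resetting the status keeps RocksDB from treating this condition
+      // as a fatal background error.
+      *bg_error = Status::OK();
+    }
+  }
+};
+}  // namespace
+#endif  // !ROCKSDB_LITE
+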
"index_value_is_delta_encoded" + << table_properties.index_value_is_delta_encoded << "filter_size" + << table_properties.filter_size << "raw_key_size" + << table_properties.raw_key_size << "raw_average_key_size" + << SafeDivide(table_properties.raw_key_size, + table_properties.num_entries) + << "raw_value_size" << table_properties.raw_value_size + << "raw_average_value_size" + << SafeDivide(table_properties.raw_value_size, + table_properties.num_entries) + << "num_data_blocks" << table_properties.num_data_blocks + << "num_entries" << table_properties.num_entries + << "num_deletions" << table_properties.num_deletions + << "num_merge_operands" << table_properties.num_merge_operands + << "num_range_deletions" << table_properties.num_range_deletions + << "format_version" << table_properties.format_version + << "fixed_key_len" << table_properties.fixed_key_len + << "filter_policy" << table_properties.filter_policy_name + << "column_family_name" << table_properties.column_family_name + << "column_family_id" << table_properties.column_family_id + << "comparator" << table_properties.comparator_name + << "merge_operator" << table_properties.merge_operator_name + << "prefix_extractor_name" + << table_properties.prefix_extractor_name << "property_collectors" + << table_properties.property_collectors_names << "compression" + << table_properties.compression_name << "compression_options" + << table_properties.compression_options << "creation_time" + << table_properties.creation_time << "oldest_key_time" + << table_properties.oldest_key_time << "file_creation_time" + << table_properties.file_creation_time; + + // user collected properties + for (const auto& prop : table_properties.readable_properties) { + jwriter << prop.first << prop.second; + } + jwriter.EndObject(); + } + + if (oldest_blob_file_number != kInvalidBlobFileNumber) { + jwriter << "oldest_blob_file_number" << oldest_blob_file_number; + } + + jwriter.EndObject(); + + event_logger->Log(jwriter); + } + +#ifndef ROCKSDB_LITE + if (listeners.size() == 0) { + return; + } + TableFileCreationInfo info; + info.db_name = db_name; + info.cf_name = cf_name; + info.file_path = file_path; + info.file_size = fd.file_size; + info.job_id = job_id; + info.table_properties = table_properties; + info.reason = reason; + info.status = s; + for (auto& listener : listeners) { + listener->OnTableFileCreated(info); + } +#else + (void)listeners; + (void)db_name; + (void)cf_name; + (void)file_path; + (void)reason; +#endif // !ROCKSDB_LITE +} + +void EventHelpers::LogAndNotifyTableFileDeletion( + EventLogger* event_logger, int job_id, uint64_t file_number, + const std::string& file_path, const Status& status, + const std::string& dbname, + const std::vector>& listeners) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + + jwriter << "job" << job_id << "event" + << "table_file_deletion" + << "file_number" << file_number; + if (!status.ok()) { + jwriter << "status" << status.ToString(); + } + + jwriter.EndObject(); + + event_logger->Log(jwriter); + +#ifndef ROCKSDB_LITE + TableFileDeletionInfo info; + info.db_name = dbname; + info.job_id = job_id; + info.file_path = file_path; + info.status = status; + for (auto& listener : listeners) { + listener->OnTableFileDeleted(info); + } +#else + (void)file_path; + (void)dbname; + (void)listeners; +#endif // !ROCKSDB_LITE +} + +void EventHelpers::NotifyOnErrorRecoveryCompleted( + const std::vector>& listeners, + Status old_bg_error, InstrumentedMutex* db_mutex) { +#ifndef ROCKSDB_LITE + if (listeners.size() == 0U) { + 
+void EventHelpers::NotifyOnErrorRecoveryCompleted(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    Status old_bg_error, InstrumentedMutex* db_mutex) {
+#ifndef ROCKSDB_LITE
+  if (listeners.size() == 0U) {
+    return;
+  }
+  db_mutex->AssertHeld();
+  // release lock while notifying events
+  db_mutex->Unlock();
+  for (auto& listener : listeners) {
+    listener->OnErrorRecoveryCompleted(old_bg_error);
+  }
+  db_mutex->Lock();
+#else
+  (void)listeners;
+  (void)old_bg_error;
+  (void)db_mutex;
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 000000000..87cc1cb8c
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventHelpers {
+ public:
+  static void AppendCurrentTime(JSONWriter* json_writer);
+#ifndef ROCKSDB_LITE
+  static void NotifyTableFileCreationStarted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id,
+      TableFileCreationReason reason);
+#endif  // !ROCKSDB_LITE
+  static void NotifyOnBackgroundError(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      BackgroundErrorReason reason, Status* bg_error,
+      InstrumentedMutex* db_mutex, bool* auto_recovery);
+  static void LogAndNotifyTableFileCreationFinished(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, const FileDescriptor& fd,
+      uint64_t oldest_blob_file_number,
+      const TableProperties& table_properties,
+      TableFileCreationReason reason, const Status& s);
+  static void LogAndNotifyTableFileDeletion(
+      EventLogger* event_logger, int job_id,
+      uint64_t file_number, const std::string& file_path,
+      const Status& status, const std::string& db_name,
+      const std::vector<std::shared_ptr<EventListener>>& listeners);
+  static void NotifyOnErrorRecoveryCompleted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      Status bg_error, InstrumentedMutex* db_mutex);
+
+ private:
+  static void LogAndNotifyTableFileCreation(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
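+
+// Usage sketch (illustrative, not part of the upstream header): callers drive
+// these helpers the same way the definitions in event_helpers.cc do, filling
+// a JSONWriter with alternating keys and values before handing it to an
+// EventLogger. `event_logger` below is an assumed caller-provided pointer.
+//
+//   JSONWriter jwriter;
+//   EventHelpers::AppendCurrentTime(&jwriter);
+//   jwriter << "event"
+//           << "example_event"
+//           << "job" << 42;
+//   jwriter.EndObject();
+//   event_logger->Log(jwriter);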
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 000000000..d12882c8f
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+#ifndef ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end) {
+  if (db == nullptr) {
+    return Status::InvalidArgument("DB is empty");
+  }
+
+  return db->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+  if (db == nullptr) {
+    return Status::InvalidArgument("Didn't recognize DB object");
+  }
+  return db->PromoteL0(column_family, target_level);
+}
+
+#else  // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+                           const Slice* /*begin*/, const Slice* /*end*/) {
+  return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+                 int /*target_level*/) {
+  return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+#endif  // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+  return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
+
+}  // namespace experimental
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_basic_test.cc b/src/rocksdb/db/external_sst_file_basic_test.cc
new file mode 100644
index 000000000..b184df20e
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_basic_test.cc
@@ -0,0 +1,1128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class ExternalSSTFileBasicTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default()));
+    DestroyAndRecreateExternalSSTFilesDir();
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    test::DestroyDir(env_, sst_files_dir_);
+    env_->CreateDir(sst_files_dir_);
+  }
+
+  Status DeprecatedAddFile(const std::vector<std::string>& files,
+                           bool move_files = false,
+                           bool skip_snapshot_check = false) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    return db_->IngestExternalFile(files, opts);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys,
+      const std::vector<ValueType>& value_types,
+      std::vector<std::pair<int, int>> range_deletions, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    assert(value_types.size() == 1 || keys.size() == value_types.size());
+    std::string file_path = sst_files_dir_ + ToString(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (size_t i = 0; i < range_deletions.size(); i++) {
+      // Account for the effect of range deletions on true_data before
+      // all point operators, even though sst_file_writer.DeleteRange
+      // must be called before other sst_file_writer methods. This is
+      // because point writes take precedence over range deletions
+      // in the same ingested sst.
+      std::string start_key = Key(range_deletions[i].first);
+      std::string end_key = Key(range_deletions[i].second);
+      s = sst_file_writer.DeleteRange(start_key, end_key);
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+      auto start_key_it = true_data->find(start_key);
+      if (start_key_it == true_data->end()) {
+        start_key_it = true_data->upper_bound(start_key);
+      }
+      auto end_key_it = true_data->find(end_key);
+      if (end_key_it == true_data->end()) {
+        end_key_it = true_data->upper_bound(end_key);
+      }
+      true_data->erase(start_key_it, end_key_it);
+    }
+    for (size_t i = 0; i < keys.size(); i++) {
+      std::string key = Key(keys[i]);
+      std::string value = Key(keys[i]) + ToString(file_id);
+      ValueType value_type =
+          (value_types.size() == 1 ? value_types[0] : value_types[i]);
+      switch (value_type) {
+        case ValueType::kTypeValue:
+          s = sst_file_writer.Put(key, value);
+          (*true_data)[key] = value;
+          break;
+        case ValueType::kTypeMerge:
+          s = sst_file_writer.Merge(key, value);
+          // we only use TestPutOperator in this test
+          (*true_data)[key] = value;
+          break;
+        case ValueType::kTypeDeletion:
+          s = sst_file_writer.Delete(key);
+          true_data->erase(key);
+          break;
+        default:
+          return Status::InvalidArgument("Value type is not supported");
+      }
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+
+    if (s.ok()) {
+      IngestExternalFileOptions ifo;
+      ifo.allow_global_seqno = true;
+      ifo.write_global_seqno = write_global_seqno;
+      ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+      s = db_->IngestExternalFile({file_path}, ifo);
+    }
+    return s;
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys,
+      const std::vector<ValueType>& value_types, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, value_types, {}, file_id, write_global_seqno,
+        verify_checksums_before_ingest, true_data);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys, const ValueType value_type,
+      int file_id, bool write_global_seqno,
+      bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, std::vector<ValueType>(1, value_type), file_id,
+        write_global_seqno, verify_checksums_before_ingest, true_data);
+  }
+
+  ~ExternalSSTFileBasicTest() override {
+    test::DestroyDir(env_, sst_files_dir_);
+  }
+
+ protected:
+  std::string sst_files_dir_;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
+};
+
+TEST_F(ExternalSSTFileBasicTest, Basic) {
+  Options options = CurrentOptions();
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before
+  // opening a file.
+  ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+
+  // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + ASSERT_EQ(file1_info.num_range_del_entries, 0); + ASSERT_EQ(file1_info.smallest_range_del_key, ""); + ASSERT_EQ(file1_info.largest_range_del_key, ""); + // sst_file_writer already finished, cannot add this value + s = sst_file_writer.Put(Key(100), "bad_val"); + ASSERT_FALSE(s.ok()) << s.ToString(); + s = sst_file_writer.DeleteRange(Key(100), Key(200)); + ASSERT_FALSE(s.ok()) << s.ToString(); + + DestroyAndReopen(options); + // Add file using file path + s = DeprecatedAddFile({file1}); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 100; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + DestroyAndRecreateExternalSSTFilesDir(); +} + +TEST_F(ExternalSSTFileBasicTest, NoCopy) { + Options options = CurrentOptions(); + const ImmutableCFOptions ioptions(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file1.sst (0 => 99) + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + + // file2.sst (100 => 299) + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 100; k < 300; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 200); + ASSERT_EQ(file2_info.smallest_key, Key(100)); + ASSERT_EQ(file2_info.largest_key, Key(299)); + + // file3.sst (110 => 124) .. 
overlap with file2.sst + std::string file3 = sst_files_dir_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 110; k < 125; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 15); + ASSERT_EQ(file3_info.smallest_key, Key(110)); + ASSERT_EQ(file3_info.largest_key, Key(124)); + + s = DeprecatedAddFile({file1}, true /* move file */); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); + + s = DeprecatedAddFile({file2}, false /* copy file */); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(env_->FileExists(file2)); + + // This file has overlapping values with the existing data + s = DeprecatedAddFile({file3}, true /* move file */); + ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_OK(env_->FileExists(file3)); + + for (int k = 0; k < 300; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } +} + +TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) { + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + do { + Options options = CurrentOptions(); + DestroyAndReopen(options); + std::map true_data; + + int file_id = 1; + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 4, 6}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {11, 15, 19}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {120, 130}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 130}, ValueType::kTypeValue, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3); + + // Write some keys through normal write path + for (int i = 0; i < 50; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber(); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {60, 61, 62}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + 
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {40, 41, 42}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {20, 30, 40}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2); + + const Snapshot* snapshot = db_->GetSnapshot(); + + // We will need a seqno for the file regardless if the file overwrite + // keys in the DB or not because we have a snapshot + ASSERT_OK(GenerateAndAddExternalFile( + options, {1000, 1002}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {2000, 3002}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5); + + db_->ReleaseSnapshot(snapshot); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {5000, 5001}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // No snapshot anymore, no need to assign a seqno + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5); + + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); + } while (ChangeOptionsForFileIngestionTest()); +} + +TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) { + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + do { + Options options = CurrentOptions(); + options.merge_operator.reset(new TestPutOperator()); + DestroyAndReopen(options); + std::map true_data; + + int file_id = 1; + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 4, 6}, ValueType::kTypeMerge, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // 
File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {120, 130}, ValueType::kTypeMerge, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 130}, ValueType::kTypeDeletion, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {}, {}, {{110, 120}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + // The range deletion ends on a key, but it doesn't actually delete + // this key because the largest key in the range is exclusive. Still, + // it counts as an overlap so a new seqno will be assigned. + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {}, {}, {{100, 109}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5); + + // Write some keys through normal write path + for (int i = 0; i < 50; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber(); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {60, 61, 62}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {40, 41, 42}, ValueType::kTypeMerge, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2); + + const Snapshot* snapshot = db_->GetSnapshot(); + + // We will need a seqno for the file regardless if the file overwrite + // keys in the DB or not because we have a snapshot + ASSERT_OK(GenerateAndAddExternalFile( + options, {1000, 1002}, ValueType::kTypeMerge, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {2000, 3002}, ValueType::kTypeMerge, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 
20, 40, 100, 150}, ValueType::kTypeMerge, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5); + + db_->ReleaseSnapshot(snapshot); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {5000, 5001}, ValueType::kTypeValue, file_id++, + write_global_seqno, verify_checksums_before_ingest, &true_data)); + // No snapshot anymore, no need to assign a seqno + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5); + + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); + } while (ChangeOptionsForFileIngestionTest()); +} + +TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) { + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + do { + Options options = CurrentOptions(); + options.merge_operator.reset(new TestPutOperator()); + DestroyAndReopen(options); + std::map true_data; + + int file_id = 1; + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2, 3, 4, 5, 6}, + {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue, + ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {10, 11, 12, 13}, + {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue, + ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 4, 6}, + {ValueType::kTypeDeletion, ValueType::kTypeValue, + ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {11, 15, 19}, + {ValueType::kTypeDeletion, ValueType::kTypeMerge, + ValueType::kTypeValue}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {150, 151, 152}, + {ValueType::kTypeValue, ValueType::kTypeMerge, + ValueType::kTypeDeletion}, + {{150, 160}, {180, 190}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {150, 151, 152}, + {ValueType::kTypeValue, 
ValueType::kTypeMerge, ValueType::kTypeValue}, + {{200, 250}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {300, 301, 302}, + {ValueType::kTypeValue, ValueType::kTypeMerge, + ValueType::kTypeDeletion}, + {{1, 2}, {152, 154}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5); + + // Write some keys through normal write path + for (int i = 0; i < 50; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber(); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {60, 61, 62}, + {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File doesn't overwrite any keys, no seqno needed + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {40, 41, 42}, + {ValueType::kTypeValue, ValueType::kTypeDeletion, + ValueType::kTypeDeletion}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {20, 30, 40}, + {ValueType::kTypeDeletion, ValueType::kTypeDeletion, + ValueType::kTypeDeletion}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // File overwrites some keys, a seqno will be assigned + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2); + + const Snapshot* snapshot = db_->GetSnapshot(); + + // We will need a seqno for the file regardless if the file overwrite + // keys in the DB or not because we have a snapshot + ASSERT_OK(GenerateAndAddExternalFile( + options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 20, 40, 100, 150}, + {ValueType::kTypeDeletion, ValueType::kTypeDeletion, + ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // A global seqno will be assigned anyway because of the snapshot + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5); + + db_->ReleaseSnapshot(snapshot); + + ASSERT_OK(GenerateAndAddExternalFile( + options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + // No snapshot anymore, no need to assign a seqno + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5); + + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); + 
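+    // (Illustrative note, not upstream code.) The assertions above follow a
+    // single rule: an ingested file consumes a global sequence number iff it
+    // overlaps live data or a snapshot is held; otherwise the latest sequence
+    // number is unchanged. A hypothetical extra step making that explicit:
+    //
+    //   SequenceNumber before = dbfull()->GetLatestSequenceNumber();
+    //   ASSERT_OK(GenerateAndAddExternalFile(
+    //       options, {9000, 9001}, ValueType::kTypeValue, file_id++,
+    //       write_global_seqno, verify_checksums_before_ingest, &true_data));
+    //   // No overlap and no snapshot -> no seqno consumed.
+    //   ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), before);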
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
+  Options options = CurrentOptions();
+  const int kNumKeys = 10000;
+
+  size_t total_fadvised_bytes = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) {
+        size_t fadvise_size = *(reinterpret_cast<size_t*>(arg));
+        total_fadvised_bytes += fadvise_size;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::unique_ptr<SstFileWriter> sst_file_writer;
+
+  std::string sst_file_path = sst_files_dir_ + "file_fadvise_disable.sst";
+  sst_file_writer.reset(
+      new SstFileWriter(EnvOptions(), options, nullptr, false));
+  ASSERT_OK(sst_file_writer->Open(sst_file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+  // fadvise disabled
+  ASSERT_EQ(total_fadvised_bytes, 0);
+
+  sst_file_path = sst_files_dir_ + "file_fadvise_enable.sst";
+  sst_file_writer.reset(
+      new SstFileWriter(EnvOptions(), options, nullptr, true));
+  ASSERT_OK(sst_file_writer->Open(sst_file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+  // fadvise enabled
+  ASSERT_EQ(total_fadvised_bytes, sst_file_writer->FileSize());
+  ASSERT_GT(total_fadvised_bytes, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = fault_injection_test_env_.get();
+
+  std::vector<std::pair<std::string, std::string>> test_cases = {
+      {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile",
+       "ExternalSstFileIngestionJob::AfterSyncIngestedFile"},
+      {"ExternalSstFileIngestionJob::BeforeSyncDir",
+       "ExternalSstFileIngestionJob::AfterSyncDir"},
+      {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno",
+       "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}};
+
+  for (size_t i = 0; i < test_cases.size(); i++) {
+    SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) {
+      fault_injection_test_env_->SetFilesystemActive(false);
+    });
+    SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) {
+      fault_injection_test_env_->SetFilesystemActive(true);
+    });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndReopen(options);
+    if (i == 2) {
+      ASSERT_OK(Put("foo", "v1"));
+    }
+
+    Options sst_file_writer_options;
+    std::unique_ptr<SstFileWriter> sst_file_writer(
+        new SstFileWriter(EnvOptions(), sst_file_writer_options));
+    std::string file_name =
+        sst_files_dir_ + "sync_failure_test_" + ToString(i) + ".sst";
+    ASSERT_OK(sst_file_writer->Open(file_name));
+    ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+    ASSERT_OK(sst_file_writer->Finish());
+
+    IngestExternalFileOptions ingest_opt;
+    if (i == 0) {
+      ingest_opt.move_files = true;
+    }
+    const Snapshot* snapshot = db_->GetSnapshot();
+    if (i == 2) {
+      ingest_opt.write_global_seqno = true;
+    }
+    ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok());
+    db_->ReleaseSnapshot(snapshot);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    Destroy(options);
+  }
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
+  Options options;
+  options.create_if_missing = true;
+  SpecialEnv senv(Env::Default());
+  options.env = &senv;
+  DestroyAndReopen(options);
+
+  Options sst_file_writer_options;
+  std::unique_ptr<SstFileWriter> sst_file_writer(
+      new SstFileWriter(EnvOptions(), sst_file_writer_options));
+  std::string file_name =
+      sst_files_dir_ + "verify_checksum_readahead_test.sst";
+  ASSERT_OK(sst_file_writer->Open(file_name));
+  Random rnd(301);
+  std::string value = DBTestBase::RandomString(&rnd, 4000);
+  for (int i = 0; i < 5000; i++) {
+    ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+
+  // Ingest it once without verifying checksums to see the baseline
+  // preads.
+  IngestExternalFileOptions ingest_opt;
+  ingest_opt.move_files = false;
+  senv.count_random_reads_ = true;
+  senv.random_read_bytes_counter_ = 0;
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+  auto base_num_reads = senv.random_read_counter_.Read();
+  // Make sure the counter is enabled.
+  ASSERT_GT(base_num_reads, 0);
+
+  // Ingest again and observe the reads made for readahead.
+  ingest_opt.move_files = false;
+  ingest_opt.verify_checksums_before_ingest = true;
+  ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024};
+
+  senv.count_random_reads_ = true;
+  senv.random_read_bytes_counter_ = 0;
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+  // Make sure the counter is enabled.
+  ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0);
+
+  // The SST file is about 20MB. Readahead size is 2MB.
+  // Give a conservative 15 reads for metadata blocks, the number
+  // of random reads should be within 20 MB / 2MB + 15 = 25.
+  ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40);
+
+  Destroy(options);
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+  int kNumLevels = 7;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = kNumLevels;
+  Reopen(options);
+
+  std::map<std::string, std::string> true_data;
+  int file_id = 1;
+  // prevent range deletions from being dropped due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot(); + + // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable + for (int i = 0; i < 3; i++) { + if (i != 0) { + db_->Flush(FlushOptions()); + if (i == 1) { + MoveFilesToLevel(kNumLevels - 1); + } + } + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(50 * i), Key(50 * (i + 1)))); + } + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1)); + + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + // overlaps with L0 file but not memtable, so flush is skipped and file is + // ingested into L0 + SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(GenerateAndAddExternalFile( + options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue}, + {{65, 70}, {70, 85}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + + // overlaps with L6 file but not memtable or L0 file, so flush is skipped and + // file is ingested into L5 + ASSERT_OK(GenerateAndAddExternalFile( + options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + + // overlaps with L5 file but not memtable or L0 file, so flush is skipped and + // file is ingested into L4 + ASSERT_OK(GenerateAndAddExternalFile( + options, {}, {}, {{5, 15}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + + // ingested file overlaps with memtable, so flush is triggered before the file + // is ingested such that the ingested data is considered newest. So L0 file + // count increases by two. + ASSERT_OK(GenerateAndAddExternalFile( + options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue}, + file_id++, write_global_seqno, verify_checksums_before_ingest, + &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); + ASSERT_EQ(4, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + + // snapshot unneeded now that all range deletions are persisted + db_->ReleaseSnapshot(snapshot); + + // overlaps with nothing, so places at bottom level and skips incrementing + // seqnum. 
+ ASSERT_OK(GenerateAndAddExternalFile( + options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue}, + {{160, 200}}, file_id++, write_global_seqno, + verify_checksums_before_ingest, &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno); + ASSERT_EQ(4, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1)); +} + +TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { + Options options = CurrentOptions(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file8.sst (delete 300 => 400) + std::string file8 = sst_files_dir_ + "file8.sst"; + ASSERT_OK(sst_file_writer.Open(file8)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); + ExternalSstFileInfo file8_info; + Status s = sst_file_writer.Finish(&file8_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file8_info.file_path, file8); + ASSERT_EQ(file8_info.num_entries, 0); + ASSERT_EQ(file8_info.smallest_key, ""); + ASSERT_EQ(file8_info.largest_key, ""); + ASSERT_EQ(file8_info.num_range_del_entries, 1); + ASSERT_EQ(file8_info.smallest_range_del_key, Key(300)); + ASSERT_EQ(file8_info.largest_range_del_key, Key(400)); + + // file9.sst (delete 400 => 500) + std::string file9 = sst_files_dir_ + "file9.sst"; + ASSERT_OK(sst_file_writer.Open(file9)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); + ExternalSstFileInfo file9_info; + s = sst_file_writer.Finish(&file9_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file9_info.file_path, file9); + ASSERT_EQ(file9_info.num_entries, 0); + ASSERT_EQ(file9_info.smallest_key, ""); + ASSERT_EQ(file9_info.largest_key, ""); + ASSERT_EQ(file9_info.num_range_del_entries, 1); + ASSERT_EQ(file9_info.smallest_range_del_key, Key(400)); + ASSERT_EQ(file9_info.largest_range_del_key, Key(500)); + + // Range deletion tombstones are exclusive on their end key, so these SSTs + // should not be considered as overlapping. 
+  s = DeprecatedAddFile({file8, file9});
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+  DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+  bool change_checksum_called = false;
+  const auto& change_checksum = [&](void* arg) {
+    if (!change_checksum_called) {
+      char* buf = reinterpret_cast<char*>(arg);
+      assert(nullptr != buf);
+      buf[0] ^= 0x1;
+      change_checksum_called = true;
+    }
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum",
+      change_checksum);
+  SyncPoint::GetInstance()->EnableProcessing();
+  int file_id = 0;
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+    Status s = GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data);
+    if (verify_checksums_before_ingest) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+    change_checksum_called = false;
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+  SyncPoint::GetInstance()->DisableProcessing();
+  int file_id = 0;
+  EnvOptions env_options;
+  do {
+    Options options = CurrentOptions();
+    std::string file_path = sst_files_dir_ + ToString(file_id++);
+    SstFileWriter sst_file_writer(env_options, options);
+    Status s = sst_file_writer.Open(file_path);
+    ASSERT_OK(s);
+    for (int i = 0; i != 100; ++i) {
+      std::string key = Key(i);
+      std::string value = Key(i) + ToString(0);
+      ASSERT_OK(sst_file_writer.Put(key, value));
+    }
+    ASSERT_OK(sst_file_writer.Finish());
+    {
+      // Get file size
+      uint64_t file_size = 0;
+      ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+      ASSERT_GT(file_size, 8);
+      std::unique_ptr<RandomRWFile> rwfile;
+      ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+      // Manually corrupt the file
+      // We deterministically corrupt the first byte because we currently
+      // cannot choose a random offset. The reason for this limitation is that
+      // we do not checksum the property block at present.
+      const uint64_t offset = 0;
+      char scratch[8] = {0};
+      Slice buf;
+      ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+      scratch[0] ^= 0xff;  // flip one bit
+      ASSERT_OK(rwfile->Write(offset, buf));
+    }
+    // Ingest file.
+    IngestExternalFileOptions ifo;
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+    s = db_->IngestExternalFile({file_path}, ifo);
+    if (ifo.verify_checksums_before_ingest) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  if (!verify_checksums_before_ingest) {
+    return;
+  }
+  uint64_t props_block_offset = 0;
+  size_t props_block_size = 0;
+  const auto& get_props_block_offset = [&](void* arg) {
+    props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+  };
+  const auto& get_props_block_size = [&](void* arg) {
+    props_block_size = *reinterpret_cast<uint64_t*>(arg);
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+      get_props_block_offset);
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+      get_props_block_size);
+  SyncPoint::GetInstance()->EnableProcessing();
+  int file_id = 0;
+  Random64 rand(time(nullptr));
+  do {
+    std::string file_path = sst_files_dir_ + ToString(file_id++);
+    Options options = CurrentOptions();
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    Status s = sst_file_writer.Open(file_path);
+    ASSERT_OK(s);
+    for (int i = 0; i != 100; ++i) {
+      std::string key = Key(i);
+      std::string value = Key(i) + ToString(0);
+      ASSERT_OK(sst_file_writer.Put(key, value));
+    }
+    ASSERT_OK(sst_file_writer.Finish());
+
+    {
+      std::unique_ptr<RandomRWFile> rwfile;
+      ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+      // Manually corrupt the file
+      ASSERT_GT(props_block_size, 8);
+      uint64_t offset =
+          props_block_offset + rand.Next() % (props_block_size - 8);
+      char scratch[8] = {0};
+      Slice buf;
+      ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+      scratch[0] ^= 0xff;  // flip one bit
+      ASSERT_OK(rwfile->Write(offset, buf));
+    }
+
+    // Ingest file.
+    IngestExternalFileOptions ifo;
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    ifo.verify_checksums_before_ingest = true;
+    s = db_->IngestExternalFile({file_path}, ifo);
+    ASSERT_NOK(s);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
+  Options options = CurrentOptions();
+
+  std::vector<std::string> files;
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    ASSERT_OK(sst_file_writer.Put("a", "z"));
+    ASSERT_OK(sst_file_writer.Put("i", "m"));
+    ExternalSstFileInfo file1_info;
+    ASSERT_OK(sst_file_writer.Finish(&file1_info));
+    files.push_back(std::move(file1));
+  }
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    ASSERT_OK(sst_file_writer.Put("i", "k"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    files.push_back(std::move(file2));
+  }
+
+  IngestExternalFileOptions ifo;
+  ASSERT_OK(db_->IngestExternalFile(files, ifo));
+  ASSERT_EQ(Get("a"), "z");
+  ASSERT_EQ(Get("i"), "k");
+
+  int total_keys = 0;
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    total_keys++;
+  }
+  delete iter;
+  ASSERT_EQ(total_keys, 2);
+
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
+                        testing::Values(std::make_tuple(true, true),
+                                        std::make_tuple(true, false),
+                                        std::make_tuple(false, true),
+                                        std::make_tuple(false, false)));
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 000000000..4cec5d376
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,731 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#ifndef ROCKSDB_LITE + +#include "db/external_sst_file_ingestion_job.h" + +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "db/version_edit.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" +#include "table/merging_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_builder.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status ExternalSstFileIngestionJob::Prepare( + const std::vector& external_files_paths, + uint64_t next_file_number, SuperVersion* sv) { + Status status; + + // Read the information of files we are ingesting + for (const std::string& file_path : external_files_paths) { + IngestedFileInfo file_to_ingest; + status = GetIngestedFileInfo(file_path, &file_to_ingest, sv); + if (!status.ok()) { + return status; + } + files_to_ingest_.push_back(file_to_ingest); + } + + for (const IngestedFileInfo& f : files_to_ingest_) { + if (f.cf_id != + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && + f.cf_id != cfd_->GetID()) { + return Status::InvalidArgument( + "External file column family id dont match"); + } + } + + const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); + auto num_files = files_to_ingest_.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } else if (num_files > 1) { + // Verify that passed files dont have overlapping ranges + autovector sorted_files; + for (size_t i = 0; i < num_files; i++) { + sorted_files.push_back(&files_to_ingest_[i]); + } + + std::sort( + sorted_files.begin(), sorted_files.end(), + [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { + return sstableKeyCompare(ucmp, info1->smallest_internal_key, + info2->smallest_internal_key) < 0; + }); + + for (size_t i = 0; i < num_files - 1; i++) { + if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= 0) { + files_overlap_ = true; + break; + } + } + } + + if (ingestion_options_.ingest_behind && files_overlap_) { + return Status::NotSupported("Files have overlapping ranges"); + } + + for (IngestedFileInfo& f : files_to_ingest_) { + if (f.num_entries == 0 && f.num_range_deletions == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { + return Status::Corruption("Generated table have corrupted keys"); + } + } + + // Copy/Move external files into DB + std::unordered_set ingestion_path_ids; + for (IngestedFileInfo& f : files_to_ingest_) { + f.fd = FileDescriptor(next_file_number++, 0, f.file_size); + f.copy_file = false; + const std::string path_outside_db = f.external_file_path; + const std::string path_inside_db = + TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), + f.fd.GetPathId()); + if (ingestion_options_.move_files) { + status = + fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr); + if (status.ok()) { + // It is unsafe to assume application had sync the file and file + // directory before ingest the file. For integrity of RocksDB we need + // to sync the file. 
+        std::unique_ptr<FSWritableFile> file_to_sync;
+        status = fs_->ReopenWritableFile(path_inside_db, env_options_,
+                                         &file_to_sync, nullptr);
+        if (status.ok()) {
+          TEST_SYNC_POINT(
+              "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+          status = SyncIngestedFile(file_to_sync.get());
+          TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+          if (!status.ok()) {
+            ROCKS_LOG_WARN(db_options_.info_log,
+                           "Failed to sync ingested file %s: %s",
+                           path_inside_db.c_str(), status.ToString().c_str());
+          }
+        }
+      } else if (status.IsNotSupported() &&
+                 ingestion_options_.failed_move_fall_back_to_copy) {
+        // Original file is on a different FS; use copy instead of hard
+        // linking.
+        f.copy_file = true;
+      }
+    } else {
+      f.copy_file = true;
+    }
+
+    if (f.copy_file) {
+      TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+                               nullptr);
+      // CopyFile also syncs the new file.
+      status = CopyFile(fs_, path_outside_db, path_inside_db, 0,
+                        db_options_.use_fsync);
+    }
+    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+    if (!status.ok()) {
+      break;
+    }
+    f.internal_file_path = path_inside_db;
+    ingestion_path_ids.insert(f.fd.GetPathId());
+  }
+
+  TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+  if (status.ok()) {
+    for (auto path_id : ingestion_path_ids) {
+      status = directories_->GetDataDir(path_id)->Fsync();
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Failed to sync directory %" ROCKSDB_PRIszt
+                       " while ingest file: %s",
+                       path_id, status.ToString().c_str());
+        break;
+      }
+    }
+  }
+  TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+  // TODO: The following is duplicated with Cleanup().
+  if (!status.ok()) {
+    // We failed; remove all files that we copied into the db
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      if (f.internal_file_path.empty()) {
+        continue;
+      }
+      Status s = env_->DeleteFile(f.internal_file_path);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+                                               SuperVersion* super_version) {
+  autovector<Range> ranges;
+  for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+    ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
+                        file_to_ingest.largest_internal_key.user_key());
+  }
+  Status status =
+      cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed);
+  if (status.ok() && *flush_needed &&
+      !ingestion_options_.allow_blocking_flush) {
+    status = Status::InvalidArgument("External file requires flush");
+  }
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ExternalSstFileIngestionJob::Run() {
+  Status status;
+  SuperVersion* super_version = cfd_->GetSuperVersion();
+#ifndef NDEBUG
+  // We should never run the job with a memtable that overlaps with the
+  // files we are ingesting
+  bool need_flush = false;
+  status = NeedsFlush(&need_flush, super_version);
+  assert(status.ok() && need_flush == false);
+#endif
+
+  bool force_global_seqno = false;
+
+  if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
+    // We need to assign a global sequence number to all the files even
+    // if they don't overlap with any ranges, since we have snapshots
+    force_global_seqno = true;
+  }
+  // It is safe to use this instead of LastAllocatedSequence since we are
+  // the only active writer, and hence they are equal
+  SequenceNumber last_seqno = versions_->LastSequence();
+  edit_.SetColumnFamily(cfd_->GetID());
+  // The levels that the files will be ingested into
+
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    SequenceNumber assigned_seqno = 0;
+    if (ingestion_options_.ingest_behind) {
+      status = CheckLevelForIngestedBehindFile(&f);
+    } else {
+      status = AssignLevelAndSeqnoForIngestedFile(
+          super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
+          last_seqno, &f, &assigned_seqno);
+    }
+    if (!status.ok()) {
+      return status;
+    }
+    status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
+    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
+                             &assigned_seqno);
+    if (assigned_seqno > last_seqno) {
+      assert(assigned_seqno == last_seqno + 1);
+      last_seqno = assigned_seqno;
+      ++consumed_seqno_count_;
+    }
+    if (!status.ok()) {
+      return status;
+    }
+
+    // We use the import time as the ancestor time. This is the time the data
+    // is written to the database.
+    int64_t temp_current_time = 0;
+    uint64_t current_time = kUnknownFileCreationTime;
+    uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+    if (env_->GetCurrentTime(&temp_current_time).ok()) {
+      current_time = oldest_ancester_time =
+          static_cast<uint64_t>(temp_current_time);
+    }
+
+    edit_.AddFile(
+        f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
+        f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
+        f.assigned_seqno, false, kInvalidBlobFileNumber, oldest_ancester_time,
+        current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+  }
+  return status;
+}
+
+void ExternalSstFileIngestionJob::UpdateStats() {
+  // Update internal stats for the newly ingested files
+  uint64_t total_keys = 0;
+  uint64_t total_l0_files = 0;
+  uint64_t total_time = env_->NowMicros() - job_start_time_;
+
+  EventLoggerStream stream = event_logger_->Log();
+  stream << "event"
+         << "ingest_finished";
+  stream << "files_ingested";
+  stream.StartArray();
+
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    InternalStats::CompactionStats stats(
+        CompactionReason::kExternalSstIngestion, 1);
+    stats.micros = total_time;
+    // If an actual copy occurred for this file, then we need to count the
+    // file size as the actual bytes written. If the file was linked, then we
+    // ignore the bytes written for file metadata.
+    // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
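+    // Illustrative numbers (not from the change itself): ingesting a 10 MB
+    // file by copy is recorded as stats.bytes_written = 10 MB, while
+    // ingesting the same file by hard link is recorded as
+    // stats.bytes_moved = 10 MB with no write I/O charged.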
+    if (f.copy_file) {
+      stats.bytes_written = f.fd.GetFileSize();
+    } else {
+      stats.bytes_moved = f.fd.GetFileSize();
+    }
+    stats.num_output_files = 1;
+    cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+                                               Env::Priority::USER, stats);
+    cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
+                                       f.fd.GetFileSize());
+    total_keys += f.num_entries;
+    if (f.picked_level == 0) {
+      total_l0_files += 1;
+    }
+    ROCKS_LOG_INFO(
+        db_options_.info_log,
+        "[AddFile] External SST file %s was ingested in L%d with path %s "
+        "(global_seqno=%" PRIu64 ")\n",
+        f.external_file_path.c_str(), f.picked_level,
+        f.internal_file_path.c_str(), f.assigned_seqno);
+    stream << "file" << f.internal_file_path << "level" << f.picked_level;
+  }
+  stream.EndArray();
+
+  stream << "lsm_state";
+  stream.StartArray();
+  auto vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
+                                     total_keys);
+  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
+                                     files_to_ingest_.size());
+  cfd_->internal_stats()->AddCFStats(
+      InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
+}
+
+void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
+  if (!status.ok()) {
+    // We failed to add the files to the database,
+    // remove all the files we copied.
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      if (f.internal_file_path.empty()) {
+        continue;
+      }
+      Status s = env_->DeleteFile(f.internal_file_path);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+    consumed_seqno_count_ = 0;
+    files_overlap_ = false;
+  } else if (status.ok() && ingestion_options_.move_files) {
+    // The files were moved and added successfully, remove original file links
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      Status s = env_->DeleteFile(f.external_file_path);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "%s was added to DB successfully but failed to remove original "
+            "file link : %s",
+            f.external_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+}
+
+Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
+    const std::string& external_file, IngestedFileInfo* file_to_ingest,
+    SuperVersion* sv) {
+  file_to_ingest->external_file_path = external_file;
+
+  // Get external file size
+  Status status = fs_->GetFileSize(external_file, IOOptions(),
+                                   &file_to_ingest->file_size, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Create TableReader for external file
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<FSRandomAccessFile> sst_file;
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+  status = fs_->NewRandomAccessFile(external_file, env_options_,
+                                    &sst_file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file),
+                                                   external_file));
+
+  status = cfd_->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(*cfd_->ioptions(),
+                         sv->mutable_cf_options.prefix_extractor.get(),
+                         env_options_, cfd_->internal_comparator()),
+      std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  if (ingestion_options_.verify_checksums_before_ingest) {
+    // If a customized readahead size is needed, we can pass a user option
+    // all the way to here.
+    // Right now we just rely on the default readahead to keep things simple.
+    ReadOptions ro;
+    ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
+    status = table_reader->VerifyChecksum(
+        ro, TableReaderCaller::kExternalSSTIngestion);
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external file properties
+  auto props = table_reader->GetTableProperties();
+  const auto& uprops = props->user_collected_properties;
+
+  // Get table version
+  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+  if (version_iter == uprops.end()) {
+    return Status::Corruption("External file version not found");
+  }
+  file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
+
+  auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+  if (file_to_ingest->version == 2) {
+    // version 2 implies that the file has a global sequence number
+    if (seqno_iter == uprops.end()) {
+      return Status::Corruption(
+          "External file global sequence number not found");
+    }
+
+    // Set the global sequence number
+    file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
+    auto offsets_iter = props->properties_offsets.find(
+        ExternalSstFilePropertyNames::kGlobalSeqno);
+    if (offsets_iter == props->properties_offsets.end() ||
+        offsets_iter->second == 0) {
+      file_to_ingest->global_seqno_offset = 0;
+      return Status::Corruption("Was not able to find file global seqno field");
+    }
+    file_to_ingest->global_seqno_offset =
+        static_cast<size_t>(offsets_iter->second);
+  } else if (file_to_ingest->version == 1) {
+    // SST file V1 should not have a global seqno field
+    assert(seqno_iter == uprops.end());
+    file_to_ingest->original_seqno = 0;
+    if (ingestion_options_.allow_blocking_flush ||
+        ingestion_options_.allow_global_seqno) {
+      return Status::InvalidArgument(
+          "External SST file V1 does not support global seqno");
+    }
+  } else {
+    return Status::InvalidArgument("External file version is not supported");
+  }
+  // Get number of entries in table
+  file_to_ingest->num_entries = props->num_entries;
+  file_to_ingest->num_range_deletions = props->num_range_deletions;
+
+  ParsedInternalKey key;
+  ReadOptions ro;
+  // While reading the external file we may cache the blocks that we read in
+  // the block cache. If we later change the global seqno of this file, we
+  // would have blocks in cache that include keys with the wrong seqno.
+  // We need to disable fill_cache so that we read from the file without
+  // updating the block cache.
+  ro.fill_cache = false;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+  std::unique_ptr<InternalIterator> range_del_iter(
+      table_reader->NewRangeTombstoneIterator(ro));
+
+  // Get first (smallest) and last (largest) key from file.
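+  // Keys written by SstFileWriter always carry sequence number 0 (the
+  // global seqno, if any, is applied separately), so the checks below treat
+  // a non-zero sequence number as a sign that the file was not produced by
+  // SstFileWriter or has been corrupted.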
+  file_to_ingest->smallest_internal_key =
+      InternalKey("", 0, ValueType::kTypeValue);
+  file_to_ingest->largest_internal_key =
+      InternalKey("", 0, ValueType::kTypeValue);
+  bool bounds_set = false;
+  iter->SeekToFirst();
+  if (iter->Valid()) {
+    if (!ParseInternalKey(iter->key(), &key)) {
+      return Status::Corruption("External file has corrupted keys");
+    }
+    if (key.sequence != 0) {
+      return Status::Corruption(
+          "External file has a non-zero sequence number");
+    }
+    file_to_ingest->smallest_internal_key.SetFrom(key);
+
+    iter->SeekToLast();
+    if (!ParseInternalKey(iter->key(), &key)) {
+      return Status::Corruption("External file has corrupted keys");
+    }
+    if (key.sequence != 0) {
+      return Status::Corruption(
+          "External file has a non-zero sequence number");
+    }
+    file_to_ingest->largest_internal_key.SetFrom(key);
+
+    bounds_set = true;
+  }
+
+  // We may need to adjust these key bounds, depending on whether any range
+  // deletion tombstones extend past them.
+  const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+  if (range_del_iter != nullptr) {
+    for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+         range_del_iter->Next()) {
+      if (!ParseInternalKey(range_del_iter->key(), &key)) {
+        return Status::Corruption("External file has corrupted keys");
+      }
+      RangeTombstone tombstone(key, range_del_iter->value());
+
+      InternalKey start_key = tombstone.SerializeKey();
+      if (!bounds_set ||
+          sstableKeyCompare(ucmp, start_key,
+                            file_to_ingest->smallest_internal_key) < 0) {
+        file_to_ingest->smallest_internal_key = start_key;
+      }
+      InternalKey end_key = tombstone.SerializeEndKey();
+      if (!bounds_set ||
+          sstableKeyCompare(ucmp, end_key,
+                            file_to_ingest->largest_internal_key) > 0) {
+        file_to_ingest->largest_internal_key = end_key;
+      }
+      bounds_set = true;
+    }
+  }
+
+  file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+  file_to_ingest->table_properties = *props;
+
+  return status;
+}
+
+Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
+    SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
+    SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
+    SequenceNumber* assigned_seqno) {
+  Status status;
+  *assigned_seqno = 0;
+  if (force_global_seqno) {
+    *assigned_seqno = last_seqno + 1;
+    if (compaction_style == kCompactionStyleUniversal || files_overlap_) {
+      file_to_ingest->picked_level = 0;
+      return status;
+    }
+  }
+
+  bool overlap_with_db = false;
+  Arena arena;
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  int target_level = 0;
+  auto* vstorage = cfd_->current()->storage_info();
+
+  for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+    if (lvl > 0 && lvl < vstorage->base_level()) {
+      continue;
+    }
+
+    if (vstorage->NumLevelFiles(lvl) > 0) {
+      bool overlap_with_level = false;
+      status = sv->current->OverlapWithLevelIterator(
+          ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
+          file_to_ingest->largest_internal_key.user_key(), lvl,
+          &overlap_with_level);
+      if (!status.ok()) {
+        return status;
+      }
+      if (overlap_with_level) {
+        // We must use L0 or any level higher than `lvl` to be able to
+        // overwrite the keys that we overlap with in this level. We also
+        // need to assign this file a seqno to overwrite the existing keys
+        // in level `lvl`.
+        overlap_with_db = true;
+        break;
+      }
+
+      if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
+        const std::vector<FileMetaData*>& level_files =
+            vstorage->LevelFiles(lvl);
+        const SequenceNumber level_largest_seqno =
+            (*std::max_element(level_files.begin(),
+                               level_files.end(),
+                               [](FileMetaData* f1, FileMetaData* f2) {
+                                 return f1->fd.largest_seqno <
+                                        f2->fd.largest_seqno;
+                               }))
+                ->fd.largest_seqno;
+        // We should only assign a seqno equal to the current level's largest
+        // seqno when the file fits into that level
+        if (level_largest_seqno != 0 &&
+            IngestedFileFitInLevel(file_to_ingest, lvl)) {
+          *assigned_seqno = level_largest_seqno;
+        } else {
+          continue;
+        }
+      }
+    } else if (compaction_style == kCompactionStyleUniversal) {
+      continue;
+    }
+
+    // We don't overlap with any keys in this level, but we still need to
+    // check if our file can fit in it
+    if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+      target_level = lvl;
+    }
+  }
+  // If files overlap, we have to ingest them at level 0 and assign the newest
+  // sequence number
+  if (files_overlap_) {
+    target_level = 0;
+    *assigned_seqno = last_seqno + 1;
+  }
+  TEST_SYNC_POINT_CALLBACK(
+      "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+      &overlap_with_db);
+  file_to_ingest->picked_level = target_level;
+  if (overlap_with_db && *assigned_seqno == 0) {
+    *assigned_seqno = last_seqno + 1;
+  }
+  return status;
+}
+
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+    IngestedFileInfo* file_to_ingest) {
+  auto* vstorage = cfd_->current()->storage_info();
+  // First, check if the new file fits in the bottommost level
+  int bottom_lvl = cfd_->NumberLevels() - 1;
+  if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+    return Status::InvalidArgument(
+        "Can't ingest_behind file as it doesn't fit "
+        "at the bottommost level!");
+  }
+
+  // Second, check if despite allow_ingest_behind=true we still have 0 seqnums
+  // at some upper level
+  for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+    for (auto file : vstorage->LevelFiles(lvl)) {
+      if (file->fd.smallest_seqno == 0) {
+        return Status::InvalidArgument(
+            "Can't ingest_behind file as despite allow_ingest_behind=true "
+            "there are files with 0 seqno in database at upper levels!");
+      }
+    }
+  }
+
+  file_to_ingest->picked_level = bottom_lvl;
+  return Status::OK();
+}
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+    IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+  if (file_to_ingest->original_seqno == seqno) {
+    // This file already has the correct global seqno
+    return Status::OK();
+  } else if (!ingestion_options_.allow_global_seqno) {
+    return Status::InvalidArgument("Global seqno is required, but disabled");
+  } else if (file_to_ingest->global_seqno_offset == 0) {
+    return Status::InvalidArgument(
+        "Trying to set global seqno for a file that doesn't have a global "
+        "seqno field");
+  }
+
+  if (ingestion_options_.write_global_seqno) {
+    // Determine if we can write global_seqno to a given offset of the file.
+    // If the file system does not support random writes, then we should not.
+    // Otherwise we should.
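+    // If NewRandomRWFile() below returns NotSupported, the in-place write is
+    // skipped, but the seqno is still recorded in
+    // file_to_ingest->assigned_seqno and, via edit() in Run() above, in the
+    // MANIFEST (the same place the write_global_seqno=false path keeps it).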
+    std::unique_ptr<FSRandomRWFile> rwfile;
+    Status status =
+        fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_,
+                             &rwfile, nullptr);
+    if (status.ok()) {
+      std::string seqno_val;
+      PutFixed64(&seqno_val, seqno);
+      status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val,
+                             IOOptions(), nullptr);
+      if (status.ok()) {
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
+        status = SyncIngestedFile(rwfile.get());
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
+        if (!status.ok()) {
+          ROCKS_LOG_WARN(db_options_.info_log,
+                         "Failed to sync ingested file %s after writing global "
+                         "sequence number: %s",
+                         file_to_ingest->internal_file_path.c_str(),
+                         status.ToString().c_str());
+        }
+      }
+      if (!status.ok()) {
+        return status;
+      }
+    } else if (!status.IsNotSupported()) {
+      return status;
+    }
+  }
+
+  file_to_ingest->assigned_seqno = seqno;
+  return Status::OK();
+}
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+    const IngestedFileInfo* file_to_ingest, int level) {
+  if (level == 0) {
+    // Files can always fit in L0
+    return true;
+  }
+
+  auto* vstorage = cfd_->current()->storage_info();
+  Slice file_smallest_user_key(
+      file_to_ingest->smallest_internal_key.user_key());
+  Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+  if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+                               &file_largest_user_key)) {
+    // The file overlaps with other files in this level; we cannot
+    // add it to this level
+    return false;
+  }
+  if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
+                                       file_largest_user_key, level)) {
+    // The file overlaps with the output of a running compaction that will be
+    // stored in this level; we cannot add this file to this level
+    return false;
+  }
+
+  // The file does not overlap with level files or compaction outputs
+  return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+  assert(file != nullptr);
+  if (db_options_.use_fsync) {
+    return file->Fsync(IOOptions(), nullptr);
+  } else {
+    return file->Sync(IOOptions(), nullptr);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.h b/src/rocksdb/db/external_sst_file_ingestion_job.h
new file mode 100644
index 000000000..7ddb6f3e8
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,180 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
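+//
+// Rough lifecycle of the job declared below, as driven by DBImpl (an
+// illustration based on the method comments in this header, not an exact
+// transcript of the DBImpl code):
+//
+//   ExternalSstFileIngestionJob job(/*...*/);
+//   Status s = job.Prepare(external_files_paths, next_file_number, sv);
+//   // writes stopped; flush first if NeedsFlush() reports memtable overlap
+//   if (s.ok()) s = job.Run();  // picks levels and seqnos, fills edit()
+//   if (s.ok()) job.UpdateStats();
+//   job.Cleanup(s);  // removes copied files on failure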
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+
+struct IngestedFileInfo {
+  // External file path
+  std::string external_file_path;
+  // Smallest internal key in external file
+  InternalKey smallest_internal_key;
+  // Largest internal key in external file
+  InternalKey largest_internal_key;
+  // Sequence number for keys in external file
+  SequenceNumber original_seqno;
+  // Offset of the global sequence number field in the file; will
+  // be zero if version is 1 (global seqno is not supported)
+  size_t global_seqno_offset;
+  // External file size
+  uint64_t file_size;
+  // Total number of keys in external file
+  uint64_t num_entries;
+  // Total number of range deletions in external file
+  uint64_t num_range_deletions;
+  // Id of column family this file should be ingested into
+  uint32_t cf_id;
+  // TableProperties read from external file
+  TableProperties table_properties;
+  // Version of external file
+  int version;
+
+  // FileDescriptor for the file inside the DB
+  FileDescriptor fd;
+  // File path that we picked for the file inside the DB
+  std::string internal_file_path;
+  // Global sequence number that we picked for the file inside the DB
+  SequenceNumber assigned_seqno = 0;
+  // Level inside the DB we picked for the external file.
+  int picked_level = 0;
+  // Whether to copy or link the external sst file. copy_file will be set to
+  // false if ingestion_options.move_files is true and the underlying FS
+  // supports the link operation. A default value is needed to keep llvm's
+  // undefined-behavior sanity check happy; since
+  // ingestion_options.move_files is false by default, copy_file defaults to
+  // true.
+  bool copy_file = true;
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+  ExternalSstFileIngestionJob(
+      Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+      const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+      SnapshotList* db_snapshots,
+      const IngestExternalFileOptions& ingestion_options,
+      Directories* directories, EventLogger* event_logger)
+      : env_(env),
+        fs_(db_options.fs.get()),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        env_options_(env_options),
+        db_snapshots_(db_snapshots),
+        ingestion_options_(ingestion_options),
+        directories_(directories),
+        event_logger_(event_logger),
+        job_start_time_(env_->NowMicros()),
+        consumed_seqno_count_(0) {
+    assert(directories != nullptr);
+  }
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(const std::vector<std::string>& external_files_paths,
+                 uint64_t next_file_number, SuperVersion* sv);
+
+  // Check if we need to flush the memtable before running the ingestion job.
+  // This will be true if the files we are ingesting overlap with any
+  // key range in the memtable.
+  //
+  // @param super_version A referenced SuperVersion that will be held for the
+  //    duration of this function.
+  //
+  // Thread-safe
+  Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
+
+  // Will execute the ingestion job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Update column family stats.
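+  // (ingested key/file counts and bytes written or moved; see the
+  // implementation in the .cc file).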
+  // REQUIRES: Mutex held
+  void UpdateStats();
+
+  // Cleanup after a successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_ingest() const {
+    return files_to_ingest_;
+  }
+
+  // How many sequence numbers did we consume as part of the ingest job?
+  int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+
+ private:
+  // Open the external file and populate `file_to_ingest` with all the
+  // external information we need to ingest this file.
+  Status GetIngestedFileInfo(const std::string& external_file,
+                             IngestedFileInfo* file_to_ingest,
+                             SuperVersion* sv);
+
+  // Assign `file_to_ingest` the appropriate sequence number and the lowest
+  // possible level that it can be ingested into according to
+  // compaction_style.
+  // REQUIRES: Mutex held
+  Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
+                                            bool force_global_seqno,
+                                            CompactionStyle compaction_style,
+                                            SequenceNumber last_seqno,
+                                            IngestedFileInfo* file_to_ingest,
+                                            SequenceNumber* assigned_seqno);
+
+  // A file that we want to ingest behind always goes to the lowest level;
+  // we just check that it fits in the level, that the DB allows
+  // ingest_behind, and that we don't have 0 seqnums at the upper levels.
+  // REQUIRES: Mutex held
+  Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
+
+  // Set the file global sequence number to `seqno`
+  Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
+                                          SequenceNumber seqno);
+
+  // Check if `file_to_ingest` can fit in level `level`
+  // REQUIRES: Mutex held
+  bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
+                              int level);
+
+  // Helper method to sync the given file.
+  template <typename TWritableFile>
+  Status SyncIngestedFile(TWritableFile* file);
+
+  Env* env_;
+  FileSystem* fs_;
+  VersionSet* versions_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const EnvOptions& env_options_;
+  SnapshotList* db_snapshots_;
+  autovector<IngestedFileInfo> files_to_ingest_;
+  const IngestExternalFileOptions& ingestion_options_;
+  Directories* directories_;
+  EventLogger* event_logger_;
+  VersionEdit edit_;
+  uint64_t job_start_time_;
+  int consumed_seqno_count_;
+  // Set in ExternalSstFileIngestionJob::Prepare(); if true, all files are
+  // ingested into L0
+  bool files_overlap_{false};
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc
new file mode 100644
index 000000000..0b91910a1
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_test.cc
@@ -0,0 +1,2832 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A test environment that can be configured to fail the Link operation.
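+// Returning Status::NotSupported() from LinkFile() exercises the
+// failed_move_fall_back_to_copy path of the ingestion job above.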
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+  ExternalSSTTestEnv(Env* t, bool fail_link)
+      : EnvWrapper(t), fail_link_(fail_link) {}
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    if (fail_link_) {
+      return Status::NotSupported("Link failed");
+    }
+    return target()->LinkFile(s, t);
+  }
+
+  void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+  bool fail_link_;
+};
+
+class ExternSSTFileLinkFailFallbackTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternSSTFileLinkFailFallbackTest()
+      : DBTestBase("/external_sst_file_test"),
+        test_env_(new ExternalSSTTestEnv(env_, true)) {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    test::DestroyDir(env_, sst_files_dir_);
+    env_->CreateDir(sst_files_dir_);
+    options_ = CurrentOptions();
+    options_.disable_auto_compactions = true;
+    options_.env = test_env_;
+  }
+
+  void TearDown() override {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options_));
+    delete test_env_;
+    test_env_ = nullptr;
+  }
+
+ protected:
+  std::string sst_files_dir_;
+  Options options_;
+  ExternalSSTTestEnv* test_env_;
+};
+
+class ExternalSSTFileTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternalSSTFileTest() : DBTestBase("/external_sst_file_test") {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    DestroyAndRecreateExternalSSTFilesDir();
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    test::DestroyDir(env_, sst_files_dir_);
+    env_->CreateDir(sst_files_dir_);
+  }
+
+  Status GenerateOneExternalFile(
+      const Options& options, ColumnFamilyHandle* cfh,
+      std::vector<std::pair<std::string, std::string>>& data, int file_id,
+      bool sort_data, std::string* external_file_path,
+      std::map<std::string, std::string>* true_data) {
+    // Generate a file id if not provided
+    if (-1 == file_id) {
+      file_id = (++last_file_id_);
+    }
+    // Sort data if asked to do so
+    if (sort_data) {
+      std::sort(data.begin(), data.end(),
+                [&](const std::pair<std::string, std::string>& e1,
+                    const std::pair<std::string, std::string>& e2) {
+                  return options.comparator->Compare(e1.first, e2.first) < 0;
+                });
+      auto uniq_iter = std::unique(
+          data.begin(), data.end(),
+          [&](const std::pair<std::string, std::string>& e1,
+              const std::pair<std::string, std::string>& e2) {
+            return options.comparator->Compare(e1.first, e2.first) == 0;
+          });
+      data.resize(uniq_iter - data.begin());
+    }
+    std::string file_path = sst_files_dir_ + ToString(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (const auto& entry : data) {
+      s = sst_file_writer.Put(entry.first, entry.second);
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+    if (s.ok() && external_file_path != nullptr) {
+      *external_file_path = file_path;
+    }
+    if (s.ok() && nullptr != true_data) {
+      for (const auto& entry : data) {
+        true_data->insert({entry.first, entry.second});
+      }
+    }
+    return s;
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options,
+      std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
+      bool allow_global_seqno = false, bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    // Generate a file id if not provided
+    if (file_id == -1) {
+      file_id = last_file_id_ + 1;
+      last_file_id_++;
+    }
+
+    // Sort data if asked to do so
+    if (sort_data) {
+      std::sort(data.begin(), data.end(),
+                [&](const std::pair<std::string, std::string>& e1,
+                    const std::pair<std::string, std::string>& e2) {
+                  return options.comparator->Compare(e1.first, e2.first) < 0;
+                });
+      auto uniq_iter = std::unique(
+          data.begin(), data.end(),
+          [&](const std::pair<std::string, std::string>& e1,
+              const std::pair<std::string, std::string>& e2) {
+            return options.comparator->Compare(e1.first, e2.first) == 0;
+          });
+      data.resize(uniq_iter - data.begin());
+    }
+    std::string file_path = sst_files_dir_ + ToString(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (auto& entry : data) {
+      s = sst_file_writer.Put(entry.first, entry.second);
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+
+    if (s.ok()) {
+      IngestExternalFileOptions ifo;
+      ifo.allow_global_seqno = allow_global_seqno;
+      ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false;
+      ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+      ifo.ingest_behind = ingest_behind;
+      if (cfh) {
+        s = db_->IngestExternalFile(cfh, {file_path}, ifo);
+      } else {
+        s = db_->IngestExternalFile({file_path}, ifo);
+      }
+    }
+
+    if (s.ok() && true_data) {
+      for (auto& entry : data) {
+        (*true_data)[entry.first] = entry.second;
+      }
+    }
+
+    return s;
+  }
+
+  Status GenerateAndAddExternalFiles(
+      const Options& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      const std::vector<IngestExternalFileOptions>& ifos,
+      std::vector<std::vector<std::pair<std::string, std::string>>>& data,
+      int file_id, bool sort_data,
+      std::vector<std::map<std::string, std::string>>& true_data) {
+    if (-1 == file_id) {
+      file_id = (++last_file_id_);
+    }
+    // Generate external SST files, one for each column family
+    size_t num_cfs = column_families.size();
+    assert(ifos.size() == num_cfs);
+    assert(data.size() == num_cfs);
+    Status s;
+    std::vector<IngestExternalFileArg> args(num_cfs);
+    for (size_t i = 0; i != num_cfs; ++i) {
+      std::string external_file_path;
+      s = GenerateOneExternalFile(
+          options, column_families[i], data[i], file_id, sort_data,
+          &external_file_path,
+          true_data.size() == num_cfs ? &true_data[i] : nullptr);
+      if (!s.ok()) {
+        return s;
+      }
+      ++file_id;
+
+      args[i].column_family = column_families[i];
+      args[i].external_files.push_back(external_file_path);
+      args[i].options = ifos[i];
+    }
+    s = db_->IngestExternalFiles(args);
+    return s;
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<std::pair<int, std::string>> data,
+      int file_id = -1, bool allow_global_seqno = false,
+      bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    std::vector<std::pair<std::string, std::string>> file_data;
+    for (auto& entry : data) {
+      file_data.emplace_back(Key(entry.first), entry.second);
+    }
+    return GenerateAndAddExternalFile(options, file_data, file_id,
+                                      allow_global_seqno, write_global_seqno,
+                                      verify_checksums_before_ingest,
+                                      ingest_behind, sort_data, true_data,
+                                      cfh);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys, int file_id = -1,
+      bool allow_global_seqno = false, bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    std::vector<std::pair<std::string, std::string>> file_data;
+    for (auto& k : keys) {
+      file_data.emplace_back(Key(k), Key(k) + ToString(file_id));
+    }
+    return GenerateAndAddExternalFile(options, file_data, file_id,
+                                      allow_global_seqno, write_global_seqno,
+                                      verify_checksums_before_ingest,
+                                      ingest_behind, sort_data, true_data,
+                                      cfh);
+  }
+
+  Status DeprecatedAddFile(const std::vector<std::string>& files,
+                           bool move_files = false,
+                           bool skip_snapshot_check = false,
+                           bool skip_write_global_seqno = false) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    opts.write_global_seqno = !skip_write_global_seqno;
+    return db_->IngestExternalFile(files, opts);
+  }
+
+  ~ExternalSSTFileTest() override { test::DestroyDir(env_, sst_files_dir_); }
+
+ protected:
+  int last_file_id_ = 0;
+  std::string sst_files_dir_;
+};
+
+TEST_F(ExternalSSTFileTest, Basic) {
+  do {
+    Options options = CurrentOptions();
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // Current file size should be 0 after sst_file_writer init and before
+    // opening a file.
+    ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+    // file1.sst (0 => 99)
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    for (int k = 0; k < 100; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file1_info;
+    Status s = sst_file_writer.Finish(&file1_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+
+    // Current file size should be non-zero after a successful write.
+    ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+    ASSERT_EQ(file1_info.file_path, file1);
+    ASSERT_EQ(file1_info.num_entries, 100);
+    ASSERT_EQ(file1_info.smallest_key, Key(0));
+    ASSERT_EQ(file1_info.largest_key, Key(99));
+    ASSERT_EQ(file1_info.num_range_del_entries, 0);
+    ASSERT_EQ(file1_info.smallest_range_del_key, "");
+    ASSERT_EQ(file1_info.largest_range_del_key, "");
+    // sst_file_writer already finished, cannot add this value
+    s = sst_file_writer.Put(Key(100), "bad_val");
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // file2.sst (100 => 199)
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    for (int k = 100; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    // Cannot add this key because it's not after the last added key
+    s = sst_file_writer.Put(Key(99), "bad_val");
+    ASSERT_FALSE(s.ok()) << s.ToString();
+    ExternalSstFileInfo file2_info;
+    s = sst_file_writer.Finish(&file2_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file2_info.file_path, file2);
+    ASSERT_EQ(file2_info.num_entries, 100);
+    ASSERT_EQ(file2_info.smallest_key, Key(100));
+    ASSERT_EQ(file2_info.largest_key, Key(199));
+
+    // file3.sst (195 => 299)
+    // This file's values overlap with file2's values
+    std::string file3 = sst_files_dir_ + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    for (int k = 195; k < 300; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file3_info;
+    s = sst_file_writer.Finish(&file3_info);
+
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    // Current file size should be non-zero after a successful Finish().
+    ASSERT_GT(sst_file_writer.FileSize(), 0);
+    ASSERT_EQ(file3_info.file_path, file3);
+    ASSERT_EQ(file3_info.num_entries, 105);
+    ASSERT_EQ(file3_info.smallest_key, Key(195));
+    ASSERT_EQ(file3_info.largest_key, Key(299));
+
+    // file4.sst (30 => 39)
+    // This file's values overlap with file1's values
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    for (int k = 30; k < 40; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file4_info;
+    s = sst_file_writer.Finish(&file4_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file4_info.file_path, file4);
+    ASSERT_EQ(file4_info.num_entries, 10);
+    ASSERT_EQ(file4_info.smallest_key, Key(30));
+    ASSERT_EQ(file4_info.largest_key, Key(39));
+
+    // file5.sst (400 => 499)
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    for (int k = 400; k < 500; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file5_info;
+    s = sst_file_writer.Finish(&file5_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file5_info.file_path, file5);
+    ASSERT_EQ(file5_info.num_entries, 100);
+    ASSERT_EQ(file5_info.smallest_key, Key(400));
+    ASSERT_EQ(file5_info.largest_key, Key(499));
+
+    // file6.sst (delete 400 => 500)
+    std::string file6 = sst_files_dir_ + "file6.sst";
+    ASSERT_OK(sst_file_writer.Open(file6));
+    sst_file_writer.DeleteRange(Key(400), Key(500));
+    ExternalSstFileInfo file6_info;
+    s = sst_file_writer.Finish(&file6_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file6_info.file_path, file6);
+    ASSERT_EQ(file6_info.num_entries, 0);
+    ASSERT_EQ(file6_info.smallest_key, "");
+    ASSERT_EQ(file6_info.largest_key, "");
+    ASSERT_EQ(file6_info.num_range_del_entries, 1);
+    ASSERT_EQ(file6_info.smallest_range_del_key, Key(400));
+    ASSERT_EQ(file6_info.largest_range_del_key, Key(500));
+
+    // file7.sst (delete 500 => 575, put even keys 520 => 598)
+    std::string file7 = sst_files_dir_ + "file7.sst";
+    ASSERT_OK(sst_file_writer.Open(file7));
+    sst_file_writer.DeleteRange(Key(500), Key(550));
+    for (int k = 520; k < 560; k += 2) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    sst_file_writer.DeleteRange(Key(525), Key(575));
+    for (int k = 560; k < 600; k += 2) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file7_info;
+    s = sst_file_writer.Finish(&file7_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file7_info.file_path, file7);
+    ASSERT_EQ(file7_info.num_entries, 40);
+    ASSERT_EQ(file7_info.smallest_key, Key(520));
+    ASSERT_EQ(file7_info.largest_key, Key(598));
+    ASSERT_EQ(file7_info.num_range_del_entries, 2);
+    ASSERT_EQ(file7_info.smallest_range_del_key, Key(500));
+    ASSERT_EQ(file7_info.largest_range_del_key, Key(575));
+
+    // file8.sst (delete 600 => 700)
+    std::string file8 = sst_files_dir_ + "file8.sst";
+    ASSERT_OK(sst_file_writer.Open(file8));
+    sst_file_writer.DeleteRange(Key(600), Key(700));
+    ExternalSstFileInfo file8_info;
+    s = sst_file_writer.Finish(&file8_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file8_info.file_path, file8);
+    ASSERT_EQ(file8_info.num_entries, 0);
+    ASSERT_EQ(file8_info.smallest_key, "");
+    ASSERT_EQ(file8_info.largest_key, "");
+    ASSERT_EQ(file8_info.num_range_del_entries, 1);
+    ASSERT_EQ(file8_info.smallest_range_del_key, Key(600));
+    ASSERT_EQ(file8_info.largest_range_del_key, Key(700));
+
+    // Cannot create an empty sst file
+    std::string file_empty = sst_files_dir_ + "file_empty.sst";
+    ExternalSstFileInfo file_empty_info;
+    s = sst_file_writer.Finish(&file_empty_info);
+    ASSERT_NOK(s);
+
+    DestroyAndReopen(options);
+    // Add file using file path
+    s = DeprecatedAddFile({file1});
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 100; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    // Adding a file while holding a snapshot will fail
+    const Snapshot* s1 = db_->GetSnapshot();
+    if (s1 != nullptr) {
+      ASSERT_NOK(DeprecatedAddFile({file2}));
+      db_->ReleaseSnapshot(s1);
+    }
+    // We can add the file after releasing the snapshot
+    ASSERT_OK(DeprecatedAddFile({file2}));
+
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 200; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    // This file has overlapping values with the existing data
+    s = DeprecatedAddFile({file3});
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // This file has overlapping values with the existing data
+    s = DeprecatedAddFile({file4});
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // Overwrite values of keys divisible by 5
+    for (int k = 0; k < 200; k += 5) {
+      ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+    }
+    ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+    // Key range of file5 (400 => 499) doesn't overlap with any keys in the DB
+    ASSERT_OK(DeprecatedAddFile({file5}));
+
+    // This file has overlapping values with the existing data
+    s = DeprecatedAddFile({file6});
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // Key range of file7 (500 => 598) doesn't overlap with any keys in the DB
+    ASSERT_OK(DeprecatedAddFile({file7}));
+
+    // Key range of file8 (600 => 700) doesn't overlap with any keys in the DB
+    ASSERT_OK(DeprecatedAddFile({file8}));
+
+    // Make sure values are correct before and after flush/compaction
+    for (int i = 0; i < 2; i++) {
+      for (int k = 0; k < 200; k++) {
+        std::string value = Key(k) + "_val";
+        if (k % 5 == 0) {
+          value += "_new";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 400; k < 500; k++) {
+        std::string value = Key(k) + "_val";
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 500; k < 600; k++) {
+        std::string value = Key(k) + "_val";
+        if (k < 520 || k % 2 == 1) {
+          value = "NOT_FOUND";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    Close();
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    // Delete keys in range (400 => 499)
+    for (int k = 400; k < 500; k++) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    // We deleted range (400 => 499) but cannot add file5 because
+    // of the deletion tombstones
+    ASSERT_NOK(DeprecatedAddFile({file5}));
+
+    // Compacting the DB will remove the tombstones
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Now we can add the file
+    ASSERT_OK(DeprecatedAddFile({file5}));
+
+    // Verify values of file5 in DB
+    for (int k = 400; k < 500; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+                         kRangeDelSkipConfigs));
+}
+
+class SstFileWriterCollector : public TablePropertiesCollector {
+ public:
+  explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) {
+    name_ = prefix_ + "_SstFileWriterCollector";
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string count = std::to_string(count_);
+    *properties = UserCollectedProperties{
+        {prefix_ + "_SstFileWriterCollector", "YES"},
+        {prefix_ + "_Count", count},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType /*type*/, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    ++count_;
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+  std::string prefix_;
+  std::string name_;
+};
+
+class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+  explicit SstFileWriterCollectorFactory(std::string prefix)
+      : prefix_(prefix), num_created_(0) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    num_created_++;
+    return new SstFileWriterCollector(prefix_);
+  }
+  const char* Name() const override { return "SstFileWriterCollectorFactory"; }
+
+  std::string prefix_;
+  uint32_t num_created_;
+};
+
+TEST_F(ExternalSSTFileTest, AddList) {
+  do {
+    Options options = CurrentOptions();
+
+    auto abc_collector =
+        std::make_shared<SstFileWriterCollectorFactory>("abc");
+    auto xyz_collector =
+        std::make_shared<SstFileWriterCollectorFactory>("xyz");
+
+    options.table_properties_collector_factories.emplace_back(abc_collector);
+    options.table_properties_collector_factories.emplace_back(xyz_collector);
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // file1.sst (0 => 99)
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    for (int k = 0; k < 100; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file1_info;
+    Status s = sst_file_writer.Finish(&file1_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file1_info.file_path, file1);
+    ASSERT_EQ(file1_info.num_entries, 100);
+    ASSERT_EQ(file1_info.smallest_key, Key(0));
+    ASSERT_EQ(file1_info.largest_key, Key(99));
+    // sst_file_writer already finished, cannot add this value
+    s = sst_file_writer.Put(Key(100), "bad_val");
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // file2.sst (100 => 199)
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    for (int k = 100; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    // Cannot add this key because it's not after the last added key
+    s = sst_file_writer.Put(Key(99), "bad_val");
+    ASSERT_FALSE(s.ok()) << s.ToString();
+    ExternalSstFileInfo file2_info;
+    s = sst_file_writer.Finish(&file2_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file2_info.file_path, file2);
+    ASSERT_EQ(file2_info.num_entries, 100);
+    ASSERT_EQ(file2_info.smallest_key, Key(100));
+    ASSERT_EQ(file2_info.largest_key, Key(199));
+
+    // file3.sst (195 => 199)
+    // This file's values overlap with file2's values
+    std::string file3 = sst_files_dir_ + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    for (int k = 195; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file3_info;
+    s = sst_file_writer.Finish(&file3_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file3_info.file_path, file3);
+    ASSERT_EQ(file3_info.num_entries, 5);
+    ASSERT_EQ(file3_info.smallest_key, Key(195));
+    ASSERT_EQ(file3_info.largest_key, Key(199));
+
+    // file4.sst (30 => 39)
+    // This file's values overlap with file1's values
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    for (int k = 30; k < 40; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file4_info;
+    s = sst_file_writer.Finish(&file4_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file4_info.file_path, file4);
+    ASSERT_EQ(file4_info.num_entries, 10);
+    ASSERT_EQ(file4_info.smallest_key, Key(30));
+    ASSERT_EQ(file4_info.largest_key, Key(39));
+
+    // file5.sst (200 => 299)
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    for (int k = 200; k < 300; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file5_info;
+    s = sst_file_writer.Finish(&file5_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file5_info.file_path, file5);
+    ASSERT_EQ(file5_info.num_entries, 100);
+    ASSERT_EQ(file5_info.smallest_key, Key(200));
+    ASSERT_EQ(file5_info.largest_key, Key(299));
+
+    // file6.sst (delete 0 => 100)
+    std::string file6 = sst_files_dir_ + "file6.sst";
+    ASSERT_OK(sst_file_writer.Open(file6));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75)));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100)));
+    ExternalSstFileInfo file6_info;
+    s = sst_file_writer.Finish(&file6_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file6_info.file_path, file6);
+    ASSERT_EQ(file6_info.num_entries, 0);
+    ASSERT_EQ(file6_info.smallest_key, "");
+    ASSERT_EQ(file6_info.largest_key, "");
+    ASSERT_EQ(file6_info.num_range_del_entries, 2);
+    ASSERT_EQ(file6_info.smallest_range_del_key, Key(0));
+    ASSERT_EQ(file6_info.largest_range_del_key, Key(100));
+
+    // file7.sst (delete 99 => 201)
+    std::string file7 = sst_files_dir_ + "file7.sst";
+    ASSERT_OK(sst_file_writer.Open(file7));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201)));
+    ExternalSstFileInfo file7_info;
+    s = sst_file_writer.Finish(&file7_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file7_info.file_path, file7);
+    ASSERT_EQ(file7_info.num_entries, 0);
+    ASSERT_EQ(file7_info.smallest_key, "");
+    ASSERT_EQ(file7_info.largest_key, "");
+    ASSERT_EQ(file7_info.num_range_del_entries, 1);
+    ASSERT_EQ(file7_info.smallest_range_del_key, Key(99));
+    ASSERT_EQ(file7_info.largest_range_del_key, Key(201));
+
+    // list 1 has an internal key range conflict
+    std::vector<std::string> file_list0({file1, file2});
+    std::vector<std::string> file_list1({file3, file2, file1});
+    std::vector<std::string> file_list2({file5});
+    std::vector<std::string> file_list3({file3, file4});
+    std::vector<std::string> file_list4({file5, file7});
+    std::vector<std::string> file_list5({file6, file7});
+
+    DestroyAndReopen(options);
+
+    // These lists of files have key ranges that overlap with each other
+    s = DeprecatedAddFile(file_list1);
+    ASSERT_FALSE(s.ok()) << s.ToString();
+    // Both of the following overlap on the range deletion tombstone.
+    s = DeprecatedAddFile(file_list4);
+    ASSERT_FALSE(s.ok()) << s.ToString();
+    s = DeprecatedAddFile(file_list5);
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // Add files using the file path list
+    s = DeprecatedAddFile(file_list0);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 200; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(props.size(), 2);
+    for (auto file_props : props) {
+      auto user_props = file_props.second->user_collected_properties;
+      ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["abc_Count"], "100");
+      ASSERT_EQ(user_props["xyz_Count"], "100");
+    }
+
+    // Adding a file while holding a snapshot will fail
+    const Snapshot* s1 = db_->GetSnapshot();
+    if (s1 != nullptr) {
+      ASSERT_NOK(DeprecatedAddFile(file_list2));
+      db_->ReleaseSnapshot(s1);
+    }
+    // We can add the file after releasing the snapshot
+    ASSERT_OK(DeprecatedAddFile(file_list2));
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 300; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(props.size(), 3);
+    for (auto file_props : props) {
+      auto user_props = file_props.second->user_collected_properties;
+      ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["abc_Count"], "100");
+      ASSERT_EQ(user_props["xyz_Count"], "100");
+    }
+
+    // This file list has overlapping values with the existing data
+    s = DeprecatedAddFile(file_list3);
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // Overwrite values of keys divisible by 5
+    for (int k = 0; k < 200; k += 5) {
+      ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+    }
+    ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+    // Make sure values are correct before and after flush/compaction
+    for (int i = 0; i < 2; i++) {
+      for (int k = 0; k < 200; k++) {
+        std::string value = Key(k) + "_val";
+        if (k % 5 == 0) {
+          value += "_new";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 200; k < 300; k++) {
+        std::string value = Key(k) + "_val";
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    // Delete keys in range (200 => 299)
+    for (int k = 200; k < 300; k++) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    // We deleted range (200 => 299) but cannot add file5 because
+    // of the deletion tombstones
+    ASSERT_NOK(DeprecatedAddFile(file_list2));
+
+    // Compacting the DB will remove the tombstones
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Now we can add the file
+    ASSERT_OK(DeprecatedAddFile(file_list2));
+
+    // Verify values of file5 in DB
+    for (int k = 200; k < 300; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+                         kRangeDelSkipConfigs));
+}
+
+TEST_F(ExternalSSTFileTest, AddListAtomicity) {
+  do {
+    Options options = CurrentOptions();
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // files[0].sst (0 => 99)
+    // files[1].sst (100 => 199)
+    // ...
+    // files[8].sst (800 => 899)
+    int n = 9;
+    std::vector<std::string> files(n);
+    std::vector<ExternalSstFileInfo> files_info(n);
+    for (int i = 0; i < n; i++) {
+      files[i] = sst_files_dir_ + "file" + std::to_string(i) + ".sst";
+      ASSERT_OK(sst_file_writer.Open(files[i]));
+      for (int k = i * 100; k < (i + 1) * 100; k++) {
+        ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+      }
+      Status s = sst_file_writer.Finish(&files_info[i]);
+      ASSERT_TRUE(s.ok()) << s.ToString();
+      ASSERT_EQ(files_info[i].file_path, files[i]);
+      ASSERT_EQ(files_info[i].num_entries, 100);
+      ASSERT_EQ(files_info[i].smallest_key, Key(i * 100));
+      ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1));
+    }
+    files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst");
+    auto s = DeprecatedAddFile(files);
+    ASSERT_NOK(s) << s.ToString();
+    for (int k = 0; k < n * 100; k++) {
+      ASSERT_EQ("NOT_FOUND", Get(Key(k)));
+    }
+    files.pop_back();
+    ASSERT_OK(DeprecatedAddFile(files));
+    for (int k = 0; k < n * 100; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+// This test reproduces a bug that could happen in some cases if the DB
+// started purging obsolete files while we were adding an external sst file.
+// This situation may result in deleting the file while it's being added.
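+// The sync point callback below injects flushes and a manual compaction
+// while Prepare() is copying the file, so obsolete-file purging runs
+// concurrently with the AddFile() call.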
+TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) {
+  Options options = CurrentOptions();
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 500)
+  std::string sst_file_path = sst_files_dir_ + "file1.sst";
+  Status s = sst_file_writer.Open(sst_file_path);
+  ASSERT_OK(s);
+  for (int i = 0; i < 500; i++) {
+    std::string k = Key(i);
+    s = sst_file_writer.Put(k, k + "_val");
+    ASSERT_OK(s);
+  }
+
+  ExternalSstFileInfo sst_file_info;
+  s = sst_file_writer.Finish(&sst_file_info);
+  ASSERT_OK(s);
+
+  options.delete_obsolete_files_period_micros = 0;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) {
+        ASSERT_OK(Put("aaa", "bbb"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(Put("aaa", "xxx"));
+        ASSERT_OK(Flush());
+        db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  s = DeprecatedAddFile({sst_file_path});
+  ASSERT_OK(s);
+
+  for (int i = 0; i < 500; i++) {
+    std::string k = Key(i);
+    std::string v = k + "_val";
+    ASSERT_EQ(Get(k), v);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, SkipSnapshot) {
+  Options options = CurrentOptions();
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+
+  // file2.sst (100 => 299)
+  std::string file2 = sst_files_dir_ + "file2.sst";
+  ASSERT_OK(sst_file_writer.Open(file2));
+  for (int k = 100; k < 300; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file2_info;
+  s = sst_file_writer.Finish(&file2_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(file2_info.file_path, file2);
+  ASSERT_EQ(file2_info.num_entries, 200);
+  ASSERT_EQ(file2_info.smallest_key, Key(100));
+  ASSERT_EQ(file2_info.largest_key, Key(299));
+
+  ASSERT_OK(DeprecatedAddFile({file1}));
+
+  // Adding the file will fail while holding a snapshot with the default
+  // skip_snapshot_check (false)
+  const Snapshot* s1 = db_->GetSnapshot();
+  if (s1 != nullptr) {
+    ASSERT_NOK(DeprecatedAddFile({file2}));
+  }
+
+  // Adding the file will succeed when skip_snapshot_check is set to true,
+  // even while the DB holds a snapshot
+  if (s1 != nullptr) {
+    ASSERT_OK(DeprecatedAddFile({file2}, false, true));
+    db_->ReleaseSnapshot(s1);
+  }
+
+  // file3.sst (300 => 399)
+  std::string file3 = sst_files_dir_ + "file3.sst";
+  ASSERT_OK(sst_file_writer.Open(file3));
+  for (int k = 300; k < 400; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file3_info;
+  s = sst_file_writer.Finish(&file3_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(file3_info.file_path, file3);
+  ASSERT_EQ(file3_info.num_entries, 100);
+  ASSERT_EQ(file3_info.smallest_key, Key(300));
+  ASSERT_EQ(file3_info.largest_key, Key(399));
+
+  // Check that the key is not present yet
+  ASSERT_EQ(Get(Key(300)), "NOT_FOUND");
+  const Snapshot* s2 = db_->GetSnapshot();
db_->GetSnapshot(); + ASSERT_OK(DeprecatedAddFile({file3}, false, true)); + ASSERT_EQ(Get(Key(300)), Key(300) + ("_val")); + ASSERT_EQ(Get(Key(300), s2), Key(300) + ("_val")); + + db_->ReleaseSnapshot(s2); +} + +TEST_F(ExternalSSTFileTest, MultiThreaded) { + // Bulk load 10 files every file contain 1000 keys + int num_files = 10; + int keys_per_file = 1000; + + // Generate file names + std::vector file_names; + for (int i = 0; i < num_files; i++) { + std::string file_name = "file_" + ToString(i) + ".sst"; + file_names.push_back(sst_files_dir_ + file_name); + } + + do { + Options options = CurrentOptions(); + + std::atomic thread_num(0); + std::function write_file_func = [&]() { + int file_idx = thread_num.fetch_add(1); + int range_start = file_idx * keys_per_file; + int range_end = range_start + keys_per_file; + + SstFileWriter sst_file_writer(EnvOptions(), options); + + ASSERT_OK(sst_file_writer.Open(file_names[file_idx])); + + for (int k = range_start; k < range_end; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k))); + } + + Status s = sst_file_writer.Finish(); + ASSERT_TRUE(s.ok()) << s.ToString(); + }; + // Write num_files files in parallel + std::vector sst_writer_threads; + for (int i = 0; i < num_files; ++i) { + sst_writer_threads.emplace_back(write_file_func); + } + + for (auto& t : sst_writer_threads) { + t.join(); + } + + fprintf(stderr, "Wrote %d files (%d keys)\n", num_files, + num_files * keys_per_file); + + thread_num.store(0); + std::atomic files_added(0); + // Thread 0 -> Load {f0,f1} + // Thread 1 -> Load {f0,f1} + // Thread 2 -> Load {f2,f3} + // Thread 3 -> Load {f2,f3} + // Thread 4 -> Load {f4,f5} + // Thread 5 -> Load {f4,f5} + // ... + std::function load_file_func = [&]() { + // We intentionally add every file twice, and assert that it was added + // only once and the other add failed + int thread_id = thread_num.fetch_add(1); + int file_idx = (thread_id / 2) * 2; + // sometimes we use copy, sometimes link .. the result should be the same + bool move_file = (thread_id % 3 == 0); + + std::vector files_to_add; + + files_to_add = {file_names[file_idx]}; + if (static_cast(file_idx + 1) < file_names.size()) { + files_to_add.push_back(file_names[file_idx + 1]); + } + + Status s = DeprecatedAddFile(files_to_add, move_file); + if (s.ok()) { + files_added += static_cast(files_to_add.size()); + } + }; + + // Bulk load num_files files in parallel + std::vector add_file_threads; + DestroyAndReopen(options); + for (int i = 0; i < num_files; ++i) { + add_file_threads.emplace_back(load_file_func); + } + + for (auto& t : add_file_threads) { + t.join(); + } + ASSERT_EQ(files_added.load(), num_files); + fprintf(stderr, "Loaded %d files (%d keys)\n", num_files, + num_files * keys_per_file); + + // Overwrite values of keys divisible by 100 + for (int k = 0; k < num_files * keys_per_file; k += 100) { + std::string key = Key(k); + Status s = Put(key, key + "_new"); + ASSERT_TRUE(s.ok()); + } + + for (int i = 0; i < 2; i++) { + // Make sure the values are correct before and after flush/compaction + for (int k = 0; k < num_files * keys_per_file; ++k) { + std::string key = Key(k); + std::string value = (k % 100 == 0) ? 
(key + "_new") : key; + ASSERT_EQ(Get(key), value); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + + fprintf(stderr, "Verified %d values\n", num_files * keys_per_file); + DestroyAndRecreateExternalSSTFilesDir(); + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); +} + +TEST_F(ExternalSSTFileTest, OverlappingRanges) { + Random rnd(301); + SequenceNumber assigned_seqno = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) { + ASSERT_TRUE(arg != nullptr); + assigned_seqno = *(static_cast(arg)); + }); + bool need_flush = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::IngestExternalFile:NeedFlush", [&need_flush](void* arg) { + ASSERT_TRUE(arg != nullptr); + need_flush = *(static_cast(arg)); + }); + bool overlap_with_db = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile", + [&overlap_with_db](void* arg) { + ASSERT_TRUE(arg != nullptr); + overlap_with_db = *(static_cast(arg)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + do { + Options options = CurrentOptions(); + DestroyAndReopen(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + printf("Option config = %d\n", option_config_); + std::vector> key_ranges; + for (int i = 0; i < 100; i++) { + int range_start = rnd.Uniform(20000); + int keys_per_range = 10 + rnd.Uniform(41); + + key_ranges.emplace_back(range_start, range_start + keys_per_range); + } + + int memtable_add = 0; + int success_add_file = 0; + int failed_add_file = 0; + std::map true_data; + for (size_t i = 0; i < key_ranges.size(); i++) { + int range_start = key_ranges[i].first; + int range_end = key_ranges[i].second; + + Status s; + std::string range_val = "range_" + ToString(i); + + // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile + if (i && i % 5 == 0) { + // Use DB::Put to insert range (insert into memtable) + range_val += "_put"; + for (int k = range_start; k <= range_end; k++) { + s = Put(Key(k), range_val); + ASSERT_OK(s); + } + memtable_add++; + } else { + // Use DB::AddFile to insert range + range_val += "_add_file"; + + // Generate the file containing the range + std::string file_name = sst_files_dir_ + env_->GenerateUniqueId(); + ASSERT_OK(sst_file_writer.Open(file_name)); + for (int k = range_start; k <= range_end; k++) { + s = sst_file_writer.Put(Key(k), range_val); + ASSERT_OK(s); + } + ExternalSstFileInfo file_info; + s = sst_file_writer.Finish(&file_info); + ASSERT_OK(s); + + // Insert the generated file + s = DeprecatedAddFile({file_name}); + auto it = true_data.lower_bound(Key(range_start)); + if (option_config_ != kUniversalCompaction && + option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + if (it != true_data.end() && it->first <= Key(range_end)) { + // This range overlap with data already exist in DB + ASSERT_NOK(s); + failed_add_file++; + } else { + ASSERT_OK(s); + success_add_file++; + } + } else { + if ((it != true_data.end() && it->first <= Key(range_end)) || + need_flush || assigned_seqno > 0 || overlap_with_db) { + // This range overlap with data already exist in DB + ASSERT_NOK(s); + failed_add_file++; + } else { + ASSERT_OK(s); + success_add_file++; + } + } + } + + if (s.ok()) { + // Update true_data map to include the new inserted data + for (int k = range_start; k <= range_end; k++) 
{ + true_data[Key(k)] = range_val; + } + } + + // Flush / Compact the DB + if (i && i % 50 == 0) { + Flush(); + } + if (i && i % 75 == 0) { + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + } + + printf("Total: %" ROCKSDB_PRIszt + " ranges\n" + "AddFile()|Success: %d ranges\n" + "AddFile()|RangeConflict: %d ranges\n" + "Put(): %d ranges\n", + key_ranges.size(), success_add_file, failed_add_file, memtable_add); + + // Verify the correctness of the data + for (const auto& kv : true_data) { + ASSERT_EQ(Get(kv.first), kv.second); + } + printf("keys/values verified\n"); + DestroyAndRecreateExternalSSTFilesDir(); + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); +} + +TEST_P(ExternalSSTFileTest, PickedLevel) { + Options options = CurrentOptions(); + options.disable_auto_compactions = false; + options.level0_file_num_compaction_trigger = 4; + options.num_levels = 4; + DestroyAndReopen(options); + + std::map true_data; + + // File 0 will go to last level (L3) + ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true, + false, false, &true_data)); + EXPECT_EQ(FilesPerLevel(), "0,0,0,1"); + + // File 1 will go to level L2 (since it overlap with file 0 in L3) + ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true, + false, false, &true_data)); + EXPECT_EQ(FilesPerLevel(), "0,0,1,1"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"ExternalSSTFileTest::PickedLevel:0", "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Start", + "ExternalSSTFileTest::PickedLevel:1"}, + {"ExternalSSTFileTest::PickedLevel:2", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Flush 4 files containing the same keys + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put(Key(3), Key(3) + "put")); + ASSERT_OK(Put(Key(8), Key(8) + "put")); + true_data[Key(3)] = Key(3) + "put"; + true_data[Key(8)] = Key(8) + "put"; + ASSERT_OK(Flush()); + } + + // Wait for BackgroundCompaction() to be called + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:0"); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:1"); + + EXPECT_EQ(FilesPerLevel(), "4,0,1,1"); + + // This file overlaps with file 0 (L3), file 1 (L2) and the + // output of compaction going to L1 + ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true, + false, false, &true_data)); + EXPECT_EQ(FilesPerLevel(), "5,0,1,1"); + + // This file does not overlap with any file or with the running compaction + ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false, + false, false, false, &true_data)); + EXPECT_EQ(FilesPerLevel(), "5,0,1,2"); + + // Hold compaction from finishing + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2"); + + dbfull()->TEST_WaitForCompact(); + EXPECT_EQ(FilesPerLevel(), "1,1,1,2"); + + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(ExternalSSTFileTest, PickedLevelBug) { + Options options = CurrentOptions(); + options.disable_auto_compactions = false; + options.level0_file_num_compaction_trigger = 3; + options.num_levels = 2; + DestroyAndReopen(options); + + std::vector file_keys; + + // file #1 in L0 + file_keys = {0, 5, 7}; + for (int k : file_keys) { + ASSERT_OK(Put(Key(k), Key(k))); + } + ASSERT_OK(Flush()); + + // file #2 in L0 + file_keys = {4, 6, 8, 9}; + for (int k : file_keys) { + ASSERT_OK(Put(Key(k), Key(k))); + } + 
ASSERT_OK(Flush());
+
+  // We have 2 overlapping files in L0
+  EXPECT_EQ(FilesPerLevel(), "2");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::PickedLevelBug:0"},
+       {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"},
+       {"ExternalSSTFileTest::PickedLevelBug:2",
+        "DBImpl::RunManualCompaction:0"},
+       {"ExternalSSTFileTest::PickedLevelBug:3",
+        "DBImpl::RunManualCompaction:1"}});
+
+  std::atomic<bool> bg_compact_started(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void* /*arg*/) { bg_compact_started.store(true); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // While writing the MANIFEST start a thread that will ask for compaction
+  ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  });
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");
+
+  // Start a thread that will ingest a new file
+  ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() {
+    file_keys = {1, 2, 3};
+    ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, 1));
+  });
+
+  // Wait for AddFile to start picking levels and writing MANIFEST
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
+
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
+
+  // We need to verify that no compactions can run while AddFile is
+  // ingesting the files into the levels it finds suitable. So we will
+  // wait for 2 seconds to give compactions a chance to run during
+  // this period, and then make sure that no compactions were able to run.
+  env_->SleepForMicroseconds(1000000 * 2);
+  ASSERT_FALSE(bg_compact_started.load());
+
+  // Hold AddFile from finishing writing the MANIFEST
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
+
+  bg_addfile.join();
+  bg_compact.join();
+
+  dbfull()->TEST_WaitForCompact();
+
+  int total_keys = 0;
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    total_keys++;
+  }
+  ASSERT_EQ(total_keys, 10);
+
+  delete iter;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, IngestNonExistingFile) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  Status s = db_->IngestExternalFile({"non_existing_file"},
+                                     IngestExternalFileOptions());
+  ASSERT_NOK(s);
+
+  // Verify file deletion is not impacted (verify a bug fix)
+  ASSERT_OK(Put(Key(1), Key(1)));
+  ASSERT_OK(Put(Key(9), Key(9)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(1), Key(1)));
+  ASSERT_OK(Put(Key(9), Key(9)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // After full compaction, there should be only 1 file.
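+  // (Equivalently, the live-file count could be checked via the public
+  // metadata API rather than walking the directory below; a sketch using the
+  // real DB::GetLiveFilesMetaData(), added here only as an illustration:)
+  {
+    std::vector<LiveFileMetaData> live_files;
+    db_->GetLiveFilesMetaData(&live_files);
+    // One column family, fully compacted: a single live SST is expected.
+    ASSERT_EQ(1, static_cast<int>(live_files.size()));
+  }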
+ std::vector files; + env_->GetChildren(dbname_, &files); + int num_sst_files = 0; + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kTableFile) { + num_sst_files++; + } + } + ASSERT_EQ(1, num_sst_files); +} + +TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) { + Options options = CurrentOptions(); + options.disable_auto_compactions = false; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 2; + DestroyAndReopen(options); + + std::function bg_compact = [&]() { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + }; + + int range_id = 0; + std::vector file_keys; + std::function bg_addfile = [&]() { + ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id)); + }; + + const int num_of_ranges = 1000; + std::vector threads; + while (range_id < num_of_ranges) { + int range_start = range_id * 10; + int range_end = range_start + 10; + + file_keys.clear(); + for (int k = range_start + 1; k < range_end; k++) { + file_keys.push_back(k); + } + ASSERT_OK(Put(Key(range_start), Key(range_start))); + ASSERT_OK(Put(Key(range_end), Key(range_end))); + ASSERT_OK(Flush()); + + if (range_id % 10 == 0) { + threads.emplace_back(bg_compact); + } + threads.emplace_back(bg_addfile); + + for (auto& t : threads) { + t.join(); + } + threads.clear(); + + range_id++; + } + + for (int rid = 0; rid < num_of_ranges; rid++) { + int range_start = rid * 10; + int range_end = range_start + 10; + + ASSERT_EQ(Get(Key(range_start)), Key(range_start)) << rid; + ASSERT_EQ(Get(Key(range_end)), Key(range_end)) << rid; + for (int k = range_start + 1; k < range_end; k++) { + std::string v = Key(k) + ToString(rid); + ASSERT_EQ(Get(Key(k)), v) << rid; + } + } +} + +TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { + Options options = CurrentOptions(); + options.disable_auto_compactions = false; + options.level0_file_num_compaction_trigger = 4; + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = 4; + DestroyAndReopen(options); + std::map true_data; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"ExternalSSTFileTest::PickedLevelDynamic:0", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Start", + "ExternalSSTFileTest::PickedLevelDynamic:1"}, + {"ExternalSSTFileTest::PickedLevelDynamic:2", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Flush 4 files containing the same keys + for (int i = 0; i < 4; i++) { + for (int k = 20; k <= 30; k++) { + ASSERT_OK(Put(Key(k), Key(k) + "put")); + true_data[Key(k)] = Key(k) + "put"; + } + for (int k = 50; k <= 60; k++) { + ASSERT_OK(Put(Key(k), Key(k) + "put")); + true_data[Key(k)] = Key(k) + "put"; + } + ASSERT_OK(Flush()); + } + + // Wait for BackgroundCompaction() to be called + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:0"); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:1"); + + // This file overlaps with the output of the compaction (going to L3) + // so the file will be added to L0 since L3 is the base level + ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false, + false, true, false, false, &true_data)); + EXPECT_EQ(FilesPerLevel(), "5"); + + // This file does not overlap with the current running compactiong + ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false, + true, false, false, &true_data)); + EXPECT_EQ(FilesPerLevel(), "5,0,0,1"); + + // Hold 
compaction from finishing + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2"); + + // Output of the compaction will go to L3 + dbfull()->TEST_WaitForCompact(); + EXPECT_EQ(FilesPerLevel(), "1,0,0,2"); + + Close(); + options.disable_auto_compactions = true; + Reopen(options); + + ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false, + true, false, false, &true_data)); + ASSERT_EQ(FilesPerLevel(), "1,0,0,3"); + + ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false, + false, true, false, false, &true_data)); + ASSERT_EQ(FilesPerLevel(), "1,0,0,4"); + + ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false, + false, true, false, false, &true_data)); + ASSERT_EQ(FilesPerLevel(), "1,0,0,5"); + + // File 5 overlaps with file 2 (L3 / base level) + ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true, + false, false, &true_data)); + ASSERT_EQ(FilesPerLevel(), "2,0,0,5"); + + // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0) + ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true, + false, false, &true_data)); + ASSERT_EQ(FilesPerLevel(), "3,0,0,5"); + + // Verify data in files + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); + + // Write range [5 => 10] to L0 + for (int i = 5; i <= 10; i++) { + std::string k = Key(i); + std::string v = k + "put"; + ASSERT_OK(Put(k, v)); + true_data[k] = v; + } + ASSERT_OK(Flush()); + ASSERT_EQ(FilesPerLevel(), "4,0,0,5"); + + // File 7 overlaps with file 4 (L3) + ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false, + false, true, false, false, &true_data)); + ASSERT_EQ(FilesPerLevel(), "5,0,0,5"); + + VerifyDBFromMap(true_data, &kcnt, false); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(ExternalSSTFileTest, AddExternalSstFileWithCustomCompartor) { + Options options = CurrentOptions(); + options.comparator = ReverseBytewiseComparator(); + DestroyAndReopen(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Generate files with these key ranges + // {14 -> 0} + // {24 -> 10} + // {34 -> 20} + // {44 -> 30} + // .. 
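+  // Under ReverseBytewiseComparator() the ordering is inverted: bytewise-
+  // larger keys sort first, which is why each range above runs high-to-low.
+  // A quick illustrative check (real Comparator API; not part of the
+  // original test):
+  {
+    const Comparator* rev = ReverseBytewiseComparator();
+    ASSERT_LT(rev->Compare(Slice("b"), Slice("a")), 0);  // "b" sorts first
+  }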
+ std::vector generated_files; + for (int i = 0; i < 10; i++) { + std::string file_name = sst_files_dir_ + env_->GenerateUniqueId(); + ASSERT_OK(sst_file_writer.Open(file_name)); + + int range_end = i * 10; + int range_start = range_end + 15; + for (int k = (range_start - 1); k >= range_end; k--) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k))); + } + ExternalSstFileInfo file_info; + ASSERT_OK(sst_file_writer.Finish(&file_info)); + generated_files.push_back(file_name); + } + + std::vector in_files; + + // These 2nd and 3rd files overlap with each other + in_files = {generated_files[0], generated_files[4], generated_files[5], + generated_files[7]}; + ASSERT_NOK(DeprecatedAddFile(in_files)); + + // These 2 files dont overlap with each other + in_files = {generated_files[0], generated_files[2]}; + ASSERT_OK(DeprecatedAddFile(in_files)); + + // These 2 files dont overlap with each other but overlap with keys in DB + in_files = {generated_files[3], generated_files[7]}; + ASSERT_NOK(DeprecatedAddFile(in_files)); + + // Files dont overlap and dont overlap with DB key range + in_files = {generated_files[4], generated_files[6], generated_files[8]}; + ASSERT_OK(DeprecatedAddFile(in_files)); + + for (int i = 0; i < 100; i++) { + if (i % 20 <= 14) { + ASSERT_EQ(Get(Key(i)), Key(i)); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } +} + +TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.IncreaseParallelism(20); + DestroyAndReopen(options); + + ASSERT_OK(GenerateAndAddExternalFile(options, {1, 4}, 1)); // L3 + ASSERT_OK(GenerateAndAddExternalFile(options, {2, 3}, 2)); // L2 + + ASSERT_OK(GenerateAndAddExternalFile(options, {10, 14}, 3)); // L3 + ASSERT_OK(GenerateAndAddExternalFile(options, {12, 13}, 4)); // L2 + + ASSERT_OK(GenerateAndAddExternalFile(options, {20, 24}, 5)); // L3 + ASSERT_OK(GenerateAndAddExternalFile(options, {22, 23}, 6)); // L2 + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():Start", [&](void* /*arg*/) { + // fit in L3 but will overlap with compaction so will be added + // to L2 but a compaction will trivially move it to L3 + // and break LSM consistency + static std::atomic called = {false}; + if (!called) { + called = true; + ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}})); + ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7)); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + dbfull()->TEST_WaitForCompact(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(ExternalSSTFileTest, CompactAddedFiles) { + Options options = CurrentOptions(); + options.num_levels = 3; + DestroyAndReopen(options); + + ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, 1)); // L3 + ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, 2)); // L2 + ASSERT_OK(GenerateAndAddExternalFile(options, {3, 8}, 3)); // L1 + ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, 4)); // L0 + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +} + +TEST_F(ExternalSSTFileTest, SstFileWriterNonSharedKeys) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + std::string file_path = sst_files_dir_ + "/not_shared"; + SstFileWriter sst_file_writer(EnvOptions(), options); + + std::string suffix(100, 'X'); + 
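+  // The long X-suffix keeps the non-shared portion of every key large, so
+  // prefix (delta) encoding in the block builder saves little. A quick check
+  // that, e.g., "A"+suffix and "BB"+suffix share no prefix at all
+  // (hypothetical verification block; not in the original test):
+  {
+    const std::string a = "A" + suffix;
+    const std::string b = "BB" + suffix;
+    size_t shared = 0;
+    while (shared < a.size() && shared < b.size() && a[shared] == b[shared]) {
+      ++shared;
+    }
+    ASSERT_EQ(shared, 0u);  // keys diverge at the very first byte
+  }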
ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("A" + suffix, "VAL")); + ASSERT_OK(sst_file_writer.Put("BB" + suffix, "VAL")); + ASSERT_OK(sst_file_writer.Put("CC" + suffix, "VAL")); + ASSERT_OK(sst_file_writer.Put("CXD" + suffix, "VAL")); + ASSERT_OK(sst_file_writer.Put("CZZZ" + suffix, "VAL")); + ASSERT_OK(sst_file_writer.Put("ZAAAX" + suffix, "VAL")); + + ASSERT_OK(sst_file_writer.Finish()); + ASSERT_OK(DeprecatedAddFile({file_path})); +} + +TEST_F(ExternalSSTFileTest, WithUnorderedWrite) { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL", + "ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"}, + {"DBImpl::WaitForPendingWrites:BeforeBlock", + "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}}); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) { + ASSERT_TRUE(*reinterpret_cast(need_flush)); + }); + + Options options = CurrentOptions(); + options.unordered_write = true; + DestroyAndReopen(options); + Put("foo", "v1"); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer([&]() { Put("bar", "v2"); }); + + TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"); + ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1, + true /* allow_global_seqno */)); + ASSERT_EQ(Get("bar"), "v3"); + + writer.join(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { + Options options = CurrentOptions(); + options.IncreaseParallelism(20); + options.level0_slowdown_writes_trigger = 256; + options.level0_stop_writes_trigger = 256; + + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + for (int iter = 0; iter < 2; iter++) { + bool write_to_memtable = (iter == 0); + DestroyAndReopen(options); + + Random rnd(301); + std::map true_data; + for (int i = 0; i < 500; i++) { + std::vector> random_data; + for (int j = 0; j < 100; j++) { + std::string k; + std::string v; + test::RandomString(&rnd, rnd.Next() % 20, &k); + test::RandomString(&rnd, rnd.Next() % 50, &v); + random_data.emplace_back(k, v); + } + + if (write_to_memtable && rnd.OneIn(4)) { + // 25% of writes go through memtable + for (auto& entry : random_data) { + ASSERT_OK(Put(entry.first, entry.second)); + true_data[entry.first] = entry.second; + } + } else { + ASSERT_OK(GenerateAndAddExternalFile( + options, random_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, &true_data)); + } + } + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + VerifyDBFromMap(true_data, &kcnt, false); + } +} + +TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { + Options options = CurrentOptions(); + options.num_levels = 5; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + std::vector> file_data; + std::map true_data; + + // Insert 100 -> 200 into the memtable + for (int i = 100; i <= 200; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + + // Insert 0 -> 20 using AddFile + file_data.clear(); + for (int i = 0; i <= 20; i++) { + file_data.emplace_back(Key(i), "L4"); + } + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + 
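+  // The two test parameters correspond to these ingestion options (a sketch
+  // of the assumed mapping inside the GenerateAndAddExternalFile helper;
+  // `sketch_ifo` is illustrative and unused by the test itself):
+  {
+    IngestExternalFileOptions sketch_ifo;
+    sketch_ifo.allow_global_seqno = true;  // needed to ingest over older data
+    sketch_ifo.write_global_seqno = write_global_seqno;
+    sketch_ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+    (void)sketch_ifo;
+  }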
ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file dont overlap with anything in the DB, will go to L4 + ASSERT_EQ("0,0,0,0,1", FilesPerLevel()); + + // Insert 80 -> 130 using AddFile + file_data.clear(); + for (int i = 80; i <= 130; i++) { + file_data.emplace_back(Key(i), "L0"); + } + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file overlap with the memtable, so it will flush it and add + // it self to L0 + ASSERT_EQ("2,0,0,0,1", FilesPerLevel()); + + // Insert 30 -> 50 using AddFile + file_data.clear(); + for (int i = 30; i <= 50; i++) { + file_data.emplace_back(Key(i), "L4"); + } + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file dont overlap with anything in the DB and fit in L4 as well + ASSERT_EQ("2,0,0,0,2", FilesPerLevel()); + + // Insert 10 -> 40 using AddFile + file_data.clear(); + for (int i = 10; i <= 40; i++) { + file_data.emplace_back(Key(i), "L3"); + } + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + + // This file overlap with files in L4, we will ingest it in L3 + ASSERT_EQ("2,0,0,1,2", FilesPerLevel()); + + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); +} + +TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + uint64_t entries_in_memtable; + std::map true_data; + + for (int k : {10, 20, 40, 80}) { + ASSERT_OK(Put(Key(k), "memtable")); + true_data[Key(k)] = "memtable"; + } + db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable); + ASSERT_GE(entries_in_memtable, 1); + + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + // No need for flush + ASSERT_OK(GenerateAndAddExternalFile( + options, {90, 100, 110}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable); + ASSERT_GE(entries_in_memtable, 1); + + // This file will flush the memtable + ASSERT_OK(GenerateAndAddExternalFile( + options, {19, 20, 21}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable); + ASSERT_EQ(entries_in_memtable, 0); + + for (int k : {200, 201, 205, 206}) { + ASSERT_OK(Put(Key(k), "memtable")); + true_data[Key(k)] = "memtable"; + } + db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable); + ASSERT_GE(entries_in_memtable, 1); + + // No need for flush, this file keys fit between the memtable keys + ASSERT_OK(GenerateAndAddExternalFile( + options, {202, 203, 204}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); + db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable); + ASSERT_GE(entries_in_memtable, 1); + + // This file will flush the memtable + ASSERT_OK(GenerateAndAddExternalFile( + options, {206, 207}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, 
&true_data));
+  db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                      &entries_in_memtable);
+  ASSERT_EQ(entries_in_memtable, 0);
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, L0SortingIssue) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+  std::map<std::string, std::string> true_data;
+
+  ASSERT_OK(Put(Key(1), "memtable"));
+  ASSERT_OK(Put(Key(10), "memtable"));
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No flush needed, no global seqno needed; ingest into L1
+  ASSERT_OK(
+      GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+                                 verify_checksums_before_ingest, false, false));
+  // No flush needed, but a global seqno is needed; ingest into L0
+  ASSERT_OK(
+      GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+                                 verify_checksums_before_ingest, false, false));
+  printf("%s\n", FilesPerLevel().c_str());
+
+  // Overwrite what we added using external files
+  ASSERT_OK(Put(Key(7), "memtable"));
+  ASSERT_OK(Put(Key(8), "memtable"));
+
+  // Read values from memtable
+  ASSERT_EQ(Get(Key(7)), "memtable");
+  ASSERT_EQ(Get(Key(8)), "memtable");
+
+  // Flush and read from L0
+  ASSERT_OK(Flush());
+  printf("%s\n", FilesPerLevel().c_str());
+  ASSERT_EQ(Get(Key(7)), "memtable");
+  ASSERT_EQ(Get(Key(8)), "memtable");
+}
+
+TEST_F(ExternalSSTFileTest, CompactionDeadlock) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 4;
+  DestroyAndReopen(options);
+
+  // Atomic counter of currently running background threads
+  std::atomic<int> running_threads(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::DelayWrite:Wait", "ExternalSSTFileTest::DeadLock:0"},
+      {"ExternalSSTFileTest::DeadLock:1", "DBImpl::AddFile:Start"},
+      {"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::DeadLock:2"},
+      {"ExternalSSTFileTest::DeadLock:3", "BackgroundCallCompaction:0"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start ingesting an external file in the background
+  ROCKSDB_NAMESPACE::port::Thread bg_ingest_file([&]() {
+    running_threads += 1;
+    ASSERT_OK(GenerateAndAddExternalFile(options, {5, 6}));
+    running_threads -= 1;
+  });
+
+  ASSERT_OK(Put(Key(1), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(2), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(3), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(4), "memtable"));
+  ASSERT_OK(Flush());
+
+  // This thread will try to insert into the memtable, but since we have 4 L0
+  // files it will be blocked, holding up the writer thread
+  ROCKSDB_NAMESPACE::port::Thread bg_block_put([&]() {
+    running_threads += 1;
+    ASSERT_OK(Put(Key(10), "memtable"));
+    running_threads -= 1;
+  });
+
+  // Make sure DelayWrite is called first
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:0");
+
+  // `DBImpl::AddFile:Start` will wait until we are here
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:1");
+
+  // Wait for IngestExternalFile() to start and acquire the mutex
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:2");
+
+  // Now let compaction start
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:3");
+
+  // Wait at most 5 seconds; if the background threads have not all finished
+  // by then, we hit the deadlock bug
+  for (int i = 0; i < 10; i++) {
+    if (running_threads.load() == 0) {
break; + } + env_->SleepForMicroseconds(500000); + } + + ASSERT_EQ(running_threads.load(), 0); + + bg_ingest_file.join(); + bg_block_put.join(); +} + +TEST_F(ExternalSSTFileTest, DirtyExit) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + std::string file_path = sst_files_dir_ + "/dirty_exit"; + std::unique_ptr sst_file_writer; + + // Destruct SstFileWriter without calling Finish() + sst_file_writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(sst_file_writer->Open(file_path)); + sst_file_writer.reset(); + + // Destruct SstFileWriter with a failing Finish + sst_file_writer.reset(new SstFileWriter(EnvOptions(), options)); + ASSERT_OK(sst_file_writer->Open(file_path)); + ASSERT_NOK(sst_file_writer->Finish()); +} + +TEST_F(ExternalSSTFileTest, FileWithCFInfo) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko", "toto"}, options); + + SstFileWriter sfw_default(EnvOptions(), options, handles_[0]); + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + SstFileWriter sfw_cf2(EnvOptions(), options, handles_[2]); + SstFileWriter sfw_unknown(EnvOptions(), options); + + // default_cf.sst + const std::string cf_default_sst = sst_files_dir_ + "/default_cf.sst"; + ASSERT_OK(sfw_default.Open(cf_default_sst)); + ASSERT_OK(sfw_default.Put("K1", "V1")); + ASSERT_OK(sfw_default.Put("K2", "V2")); + ASSERT_OK(sfw_default.Finish()); + + // cf1.sst + const std::string cf1_sst = sst_files_dir_ + "/cf1.sst"; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K3", "V1")); + ASSERT_OK(sfw_cf1.Put("K4", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + + // cf_unknown.sst + const std::string unknown_sst = sst_files_dir_ + "/cf_unknown.sst"; + ASSERT_OK(sfw_unknown.Open(unknown_sst)); + ASSERT_OK(sfw_unknown.Put("K5", "V1")); + ASSERT_OK(sfw_unknown.Put("K6", "V2")); + ASSERT_OK(sfw_unknown.Finish()); + + IngestExternalFileOptions ifo; + + // SST CF dont match + ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo)); + // SST CF dont match + ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo)); + // SST CF match + ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo)); + + // SST CF dont match + ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo)); + // SST CF dont match + ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo)); + // SST CF match + ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo)); + + // SST CF unknown + ASSERT_OK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo)); + // SST CF unknown + ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo)); + // SST CF unknown + ASSERT_OK(db_->IngestExternalFile(handles_[0], {unknown_sst}, ifo)); + + // Cannot ingest a file into a dropped CF + ASSERT_OK(db_->DropColumnFamily(handles_[1])); + ASSERT_NOK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo)); + + // CF was not dropped, ok to Ingest + ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo)); +} + +/* + * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy + */ +TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); + const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + 
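+  // move_files=true asks ingestion to hard-link the external SST into the DB
+  // directory; failed_move_fall_back_to_copy (assigned just below) controls
+  // whether a failed link (e.g. across filesystems) degrades to a copy. A
+  // standalone sketch of the option pairing being tested (illustration only;
+  // `ifo` above is the object the test actually uses):
+  {
+    IngestExternalFileOptions link_opts;
+    link_opts.move_files = true;                     // try hard links first
+    link_opts.failed_move_fall_back_to_copy = true;  // copy if linking fails
+    (void)link_opts;
+  }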
ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
+
+  std::string file_path = sst_files_dir_ + "file1.sst";
+  // Create SstFileWriter for the default column family
+  SstFileWriter sst_file_writer(env_options, options_);
+  ASSERT_OK(sst_file_writer.Open(file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
+  }
+  ASSERT_OK(sst_file_writer.Finish());
+  uint64_t file_size = 0;
+  ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+
+  bool copyfile = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:CopyFile",
+      [&](void* /* arg */) { copyfile = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  const Status s = db_->IngestExternalFile({file_path}, ifo);
+
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  const InternalStats* internal_stats_ptr = cfd->internal_stats();
+  const std::vector<InternalStats::CompactionStats>& comp_stats =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  uint64_t bytes_copied = 0;
+  uint64_t bytes_moved = 0;
+  for (const auto& stats : comp_stats) {
+    bytes_copied += stats.bytes_written;
+    bytes_moved += stats.bytes_moved;
+  }
+
+  if (!fail_link) {
+    // The link operation succeeds; the external SST should be moved.
+    ASSERT_OK(s);
+    ASSERT_EQ(0, bytes_copied);
+    ASSERT_EQ(file_size, bytes_moved);
+    ASSERT_FALSE(copyfile);
+  } else {
+    // The link operation fails.
+    ASSERT_EQ(0, bytes_moved);
+    if (failed_move_fall_back_to_copy) {
+      ASSERT_OK(s);
+      // copyfile is true since a failed link falls back to copying the file.
+      ASSERT_TRUE(copyfile);
+      ASSERT_EQ(file_size, bytes_copied);
+    } else {
+      ASSERT_TRUE(s.IsNotSupported());
+      // copyfile is false since a failed link does not fall back to a copy.
+ ASSERT_FALSE(copyfile); + ASSERT_EQ(0, bytes_copied); + } + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +class TestIngestExternalFileListener : public EventListener { + public: + void OnExternalFileIngested(DB* /*db*/, + const ExternalFileIngestionInfo& info) override { + ingested_files.push_back(info); + } + + std::vector ingested_files; +}; + +TEST_P(ExternalSSTFileTest, IngestionListener) { + Options options = CurrentOptions(); + TestIngestExternalFileListener* listener = + new TestIngestExternalFileListener(); + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"koko", "toto"}, options); + + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + // Ingest into default cf + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[0])); + ASSERT_EQ(listener->ingested_files.size(), 1); + ASSERT_EQ(listener->ingested_files.back().cf_name, "default"); + ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id, + 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name, + "default"); + + // Ingest into cf1 + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[1])); + ASSERT_EQ(listener->ingested_files.size(), 2); + ASSERT_EQ(listener->ingested_files.back().cf_name, "koko"); + ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id, + 1); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name, + "koko"); + + // Ingest into cf2 + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[2])); + ASSERT_EQ(listener->ingested_files.size(), 3); + ASSERT_EQ(listener->ingested_files.back().cf_name, "toto"); + ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id, + 2); + ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name, + "toto"); +} + +TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + const int kNumKeys = 10000; + + // Insert keys using normal path and take a snapshot + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), Key(i) + "_V1")); + } + const Snapshot* snap = db_->GetSnapshot(); + + // Overwrite all keys using IngestExternalFile + std::string sst_file_path = sst_files_dir_ + "file1.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(sst_file_path)); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_V2")); + } + ASSERT_OK(sst_file_writer.Finish()); + + IngestExternalFileOptions ifo; + ifo.move_files = true; + ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo)); + + for (int i = 0; i < kNumKeys; i++) { + ASSERT_EQ(Get(Key(i), snap), Key(i) + "_V1"); + ASSERT_EQ(Get(Key(i)), Key(i) + "_V2"); + } + + db_->ReleaseSnapshot(snap); +} + +TEST_P(ExternalSSTFileTest, IngestBehind) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 3; + 
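+  // ingest_behind places a file underneath all existing data: it requires
+  // the DB to be opened with allow_ingest_behind=true, which reserves the
+  // bottommost level for such files. A sketch of the minimal option wiring
+  // (illustration only; the test drives this via GenerateAndAddExternalFile):
+  {
+    Options behind_opts = CurrentOptions();
+    behind_opts.allow_ingest_behind = true;  // reserve the bottom level
+    IngestExternalFileOptions behind_ifo;
+    behind_ifo.ingest_behind = true;  // only legal with the DB option above
+    (void)behind_opts;
+    (void)behind_ifo;
+  }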
options.disable_auto_compactions = false; + DestroyAndReopen(options); + std::vector> file_data; + std::map true_data; + + // Insert 100 -> 200 into the memtable + for (int i = 100; i <= 200; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + + // Insert 100 -> 200 using IngestExternalFile + file_data.clear(); + for (int i = 0; i <= 20; i++) { + file_data.emplace_back(Key(i), "ingest_behind"); + } + + bool allow_global_seqno = true; + bool ingest_behind = true; + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + + // Can't ingest behind since allow_ingest_behind isn't set to true + ASSERT_NOK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, ingest_behind, false /*sort_data*/, + &true_data)); + + options.allow_ingest_behind = true; + // check that we still can open the DB, as num_levels should be + // sanitized to 3 + options.num_levels = 2; + DestroyAndReopen(options); + + options.num_levels = 3; + DestroyAndReopen(options); + // Insert 100 -> 200 into the memtable + for (int i = 100; i <= 200; i++) { + ASSERT_OK(Put(Key(i), "memtable")); + true_data[Key(i)] = "memtable"; + } + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // Universal picker should go at second from the bottom level + ASSERT_EQ("0,1", FilesPerLevel()); + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, true /*ingest_behind*/, + false /*sort_data*/, &true_data)); + ASSERT_EQ("0,1,1", FilesPerLevel()); + // this time ingest should fail as the file doesn't fit to the bottom level + ASSERT_NOK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, true /*ingest_behind*/, + false /*sort_data*/, &true_data)); + ASSERT_EQ("0,1,1", FilesPerLevel()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // bottom level should be empty + ASSERT_EQ("0,1", FilesPerLevel()); + + size_t kcnt = 0; + VerifyDBFromMap(true_data, &kcnt, false); +} + +TEST_F(ExternalSSTFileTest, SkipBloomFilter) { + Options options = CurrentOptions(); + + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + + // Create external SST file and include bloom filters + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + { + std::string file_path = sst_files_dir_ + "sst_with_bloom.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("Key1", "Value1")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK( + db_->IngestExternalFile({file_path}, IngestExternalFileOptions())); + + ASSERT_EQ(Get("Key1"), "Value1"); + ASSERT_GE( + options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1); + } + + // Create external SST file but skip bloom filters + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + { + std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true, + Env::IOPriority::IO_TOTAL, + true /* skip_filters */); + ASSERT_OK(sst_file_writer.Open(file_path)); + 
ASSERT_OK(sst_file_writer.Put("Key1", "Value1")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK( + db_->IngestExternalFile({file_path}, IngestExternalFileOptions())); + + ASSERT_EQ(Get("Key1"), "Value1"); + ASSERT_EQ( + options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0); + } +} + +TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) { + if (!ZSTD_Supported()) { + return; + } + const int kNumEntries = 1 << 10; + const int kNumBytesPerEntry = 1 << 10; + Options options = CurrentOptions(); + options.compression = kZSTD; + options.compression_opts.max_dict_bytes = 1 << 14; // 16KB + options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB + DestroyAndReopen(options); + + std::atomic num_compression_dicts(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + [&](void* /* arg */) { ++num_compression_dicts; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + std::vector> random_data; + for (int i = 0; i < kNumEntries; i++) { + std::string val; + test::RandomString(&rnd, kNumBytesPerEntry, &val); + random_data.emplace_back(Key(i), std::move(val)); + } + ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data))); + ASSERT_EQ(1, num_compression_dicts); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_OK(s); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + const std::string& value = elem.second; + ASSERT_EQ(value, Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, + IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "BeforeRead"}, + 
{"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "AfterRead", + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + const std::vector> data_before_ingestion = + {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}}, + {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}}, + {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}}; + for (size_t i = 0; i != handles_.size(); ++i) { + int cf = static_cast(i); + const auto& orig_data = data_before_ingestion[i]; + for (const auto& kv : orig_data) { + ASSERT_OK(Put(cf, kv.first, kv.second)); + } + ASSERT_OK(Flush(cf)); + } + + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + // Take snapshot before ingestion starts + ReadOptions read_opts; + read_opts.total_order_seek = true; + read_opts.snapshot = dbfull()->GetSnapshot(); + std::vector iters(handles_.size()); + + // Range scan checks first kv of each CF before ingestion starts. 
+ for (size_t i = 0; i != handles_.size(); ++i) { + iters[i] = dbfull()->NewIterator(read_opts, handles_[i]); + iters[i]->SeekToFirst(); + ASSERT_TRUE(iters[i]->Valid()); + const std::string& key = iters[i]->key().ToString(); + const std::string& value = iters[i]->value().ToString(); + const std::map& orig_data = + data_before_ingestion[i]; + std::map::const_iterator it = orig_data.find(key); + ASSERT_NE(orig_data.end(), it); + ASSERT_EQ(it->second, value); + iters[i]->Next(); + } + port::Thread ingest_thread([&]() { + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "BeforeRead"); + // Should see only data before ingestion + for (size_t i = 0; i != handles_.size(); ++i) { + const auto& orig_data = data_before_ingestion[i]; + for (; iters[i]->Valid(); iters[i]->Next()) { + const std::string& key = iters[i]->key().ToString(); + const std::string& value = iters[i]->value().ToString(); + std::map::const_iterator it = + orig_data.find(key); + ASSERT_NE(orig_data.end(), it); + ASSERT_EQ(it->second, value); + } + } + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "AfterRead"); + ingest_thread.join(); + for (auto* iter : iters) { + delete iter; + } + iters.clear(); + dbfull()->ReleaseSnapshot(read_opts.snapshot); + + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + // Should see consistent state after ingestion for all column families even + // without snapshot. + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + const std::string& value = elem.second; + ASSERT_EQ(value, Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" + "0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:" + "1", + "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingest + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + + // Resize the true_data vector upon 
construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_NOK(s); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" + "0"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:" + "1"); + ingest_thread.join(); + + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:BeforeJobsRun:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "1", + "DBImpl::IngestExternalFiles:BeforeJobsRun:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_NOK(s); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "0"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "1"); + ingest_thread.join(); + + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, + 
IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + SyncPoint::GetInstance()->ClearTrace(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:1", + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_NOK(s); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:0"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:1"); + ingest_thread.join(); + + fault_injection_env->DropUnsyncedFileData(); + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) { + Options options = CurrentOptions(); + // Use large buffer to avoid memtable flush + options.write_buffer_size = 1024 * 1024; + options.two_write_queues = true; + DestroyAndReopen(options); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1")); + + // Put one key which is overlap with keys in memtable. + // It will trigger flushing memtable and require this thread is + // currently at the front of the 2nd writer queue. We must make + // sure that it won't enter the 2nd writer queue for the second time. 
+  std::vector<std::pair<std::string, std::string>> data;
+  data.push_back(std::make_pair("1001", "v2"));
+  GenerateAndAddExternalFile(options, data);
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
+                        testing::Values(std::make_tuple(false, false),
+                                        std::make_tuple(false, true),
+                                        std::make_tuple(true, false),
+                                        std::make_tuple(true, true)));
+
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+                        ExternSSTFileLinkFailFallbackTest,
+                        testing::Values(std::make_tuple(true, false),
+                                        std::make_tuple(true, true),
+                                        std::make_tuple(false, false)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as External SST File Writer and Ingestion are not supported "
+          "in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 000000000..f4ca3458a
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
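// Editorial sketch: the essence of the fault-injection approach described in
// the comment above, reduced to a single toy file. ToyFile and its methods
// are illustrative assumptions, not RocksDB APIs; the real
// FaultInjectionTestEnv wraps every file the DB opens and remembers how much
// of each one has been synced.
#include <algorithm>
#include <cassert>
#include <string>

struct ToyFile {
  std::string data;
  size_t synced_bytes = 0;

  void Append(const std::string& buf) { data += buf; }
  void Sync() { synced_bytes = data.size(); }
  // Simulate a crash: anything appended after the last Sync() is lost.
  void DropUnsynced() { data.resize(std::min(data.size(), synced_bytes)); }
};

int main() {
  ToyFile f;
  f.Append("durable");
  f.Sync();
  f.Append("volatile");
  f.DropUnsynced();
  assert(f.data == "durable");  // the unsynced suffix disappeared
  return 0;
}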
+ +#include "db/db_impl/db_impl.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +static const int kValueSize = 1000; +static const int kMaxNumValues = 2000; +static const size_t kNumIterations = 3; + +enum FaultInjectionOptionConfig { + kDefault, + kDifferentDataDir, + kWalDir, + kSyncWal, + kWalDirSyncWal, + kMultiLevels, + kEnd, +}; +class FaultInjectionTest + : public testing::Test, + public testing::WithParamInterface> { + protected: + int option_config_; + int non_inclusive_end_range_; // kEnd or equivalent to that + // When need to make sure data is persistent, sync WAL + bool sync_use_wal_; + // When need to make sure data is persistent, call DB::CompactRange() + bool sync_use_compact_; + + bool sequential_order_; + + protected: + public: + enum ExpectedVerifResult { kValExpectFound, kValExpectNoError }; + enum ResetMethod { + kResetDropUnsyncedData, + kResetDropRandomUnsyncedData, + kResetDeleteUnsyncedFiles, + kResetDropAndDeleteUnsynced + }; + + std::unique_ptr base_env_; + FaultInjectionTestEnv* env_; + std::string dbname_; + std::shared_ptr tiny_cache_; + Options options_; + DB* db_; + + FaultInjectionTest() + : option_config_(std::get<1>(GetParam())), + non_inclusive_end_range_(std::get<2>(GetParam())), + sync_use_wal_(false), + sync_use_compact_(true), + base_env_(nullptr), + env_(nullptr), + db_(nullptr) {} + + ~FaultInjectionTest() override { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + } + + bool ChangeOptions() { + option_config_++; + if (option_config_ >= non_inclusive_end_range_) { + return false; + } else { + if (option_config_ == kMultiLevels) { + base_env_.reset(new MockEnv(Env::Default())); + } + return true; + } + } + + // Return the current option configuration. + Options CurrentOptions() { + sync_use_wal_ = false; + sync_use_compact_ = true; + Options options; + switch (option_config_) { + case kWalDir: + options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal"); + break; + case kDifferentDataDir: + options.db_paths.emplace_back( + test::PerThreadDBPath(env_, "fault_test_data"), 1000000U); + break; + case kSyncWal: + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + case kWalDirSyncWal: + options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal"); + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + case kMultiLevels: + options.write_buffer_size = 64 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 128 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + default: + break; + } + return options; + } + + Status NewDB() { + assert(db_ == nullptr); + assert(tiny_cache_ == nullptr); + assert(env_ == nullptr); + + env_ = + new FaultInjectionTestEnv(base_env_ ? 
base_env_.get() : Env::Default()); + + options_ = CurrentOptions(); + options_.env = env_; + options_.paranoid_checks = true; + + BlockBasedTableOptions table_options; + tiny_cache_ = NewLRUCache(100); + table_options.block_cache = tiny_cache_; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + dbname_ = test::PerThreadDBPath("fault_test"); + + EXPECT_OK(DestroyDB(dbname_, options_)); + + options_.create_if_missing = true; + Status s = OpenDB(); + options_.create_if_missing = false; + return s; + } + + void SetUp() override { + sequential_order_ = std::get<0>(GetParam()); + ASSERT_OK(NewDB()); + } + + void TearDown() override { + CloseDB(); + + Status s = DestroyDB(dbname_, options_); + + delete env_; + env_ = nullptr; + + tiny_cache_.reset(); + + ASSERT_OK(s); + } + + void Build(const WriteOptions& write_options, int start_idx, int num_vals) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = start_idx; i < start_idx + num_vals; i++) { + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(write_options, &batch)); + } + } + + Status ReadValue(int i, std::string* val) const { + std::string key_space, value_space; + Slice key = Key(i, &key_space); + Value(i, &value_space); + ReadOptions options; + return db_->Get(options, key, val); + } + + Status Verify(int start_idx, int num_vals, + ExpectedVerifResult expected) const { + std::string val; + std::string value_space; + Status s; + for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) { + Value(i, &value_space); + s = ReadValue(i, &val); + if (s.ok()) { + EXPECT_EQ(value_space, val); + } + if (expected == kValExpectFound) { + if (!s.ok()) { + fprintf(stderr, "Error when read %dth record (expect found): %s\n", i, + s.ToString().c_str()); + return s; + } + } else if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "Error when read %dth record: %s\n", i, + s.ToString().c_str()); + return s; + } + } + return Status::OK(); + } + + // Return the ith key + Slice Key(int i, std::string* storage) const { + unsigned long long num = i; + if (!sequential_order_) { + // random transfer + const int m = 0x5bd1e995; + num *= m; + num ^= num << 24; + } + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", static_cast(num)); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) const { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } + + void CloseDB() { + delete db_; + db_ = nullptr; + } + + Status OpenDB() { + CloseDB(); + env_->ResetState(); + Status s = DB::Open(options_, dbname_, &db_); + assert(db_ != nullptr); + return s; + } + + void DeleteAllData() { + Iterator* iter = db_->NewIterator(ReadOptions()); + WriteOptions options; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); + } + + delete iter; + + FlushOptions flush_options; + flush_options.wait = true; + db_->Flush(flush_options); + } + + // rnd cannot be null for kResetDropRandomUnsyncedData + void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) { + env_->AssertNoOpenFile(); + switch (reset_method) { + case kResetDropUnsyncedData: + ASSERT_OK(env_->DropUnsyncedFileData()); + break; + case kResetDropRandomUnsyncedData: + ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd)); + break; + case kResetDeleteUnsyncedFiles: + 
ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + case kResetDropAndDeleteUnsynced: + ASSERT_OK(env_->DropUnsyncedFileData()); + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + default: + assert(false); + } + } + + void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) { + DeleteAllData(); + + WriteOptions write_options; + write_options.sync = sync_use_wal_; + + Build(write_options, 0, num_pre_sync); + if (sync_use_compact_) { + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + } + write_options.sync = false; + Build(write_options, num_pre_sync, num_post_sync); + } + + void PartialCompactTestReopenWithFault(ResetMethod reset_method, + int num_pre_sync, int num_post_sync, + Random* rnd = nullptr) { + env_->SetFilesystemActive(false); + CloseDB(); + ResetDBState(reset_method, rnd); + ASSERT_OK(OpenDB()); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound)); + ASSERT_OK(Verify(num_pre_sync, num_post_sync, + FaultInjectionTest::kValExpectNoError)); + WaitCompactionFinish(); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound)); + ASSERT_OK(Verify(num_pre_sync, num_post_sync, + FaultInjectionTest::kValExpectNoError)); + } + + void NoWriteTestPreFault() { + } + + void NoWriteTestReopenWithFault(ResetMethod reset_method) { + CloseDB(); + ResetDBState(reset_method); + ASSERT_OK(OpenDB()); + } + + void WaitCompactionFinish() { + static_cast(db_->GetRootDB())->TEST_WaitForCompact(); + ASSERT_OK(db_->Put(WriteOptions(), "", "")); + } +}; + +class FaultInjectionTestSplitted : public FaultInjectionTest {}; + +TEST_P(FaultInjectionTestSplitted, FaultTest) { + do { + Random rnd(301); + + for (size_t idx = 0; idx < kNumIterations; idx++) { + int num_pre_sync = rnd.Uniform(kMaxNumValues); + int num_post_sync = rnd.Uniform(kMaxNumValues); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync, + num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData, + num_pre_sync, num_post_sync, &rnd); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + // Setting a separate data path won't pass the test as we don't sync + // it after creating new files, + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, + num_pre_sync, num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. + PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync, + num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); + } + } while (ChangeOptions()); +} + +// Previous log file is not fsynced if sync is forced after log rolling. +TEST_P(FaultInjectionTest, WriteOptionSyncTest) { + test::SleepingBackgroundTask sleeping_task_low; + env_->SetBackgroundThreads(1, Env::HIGH); + // Block the job queue to prevent flush job from running. 
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::HIGH); + sleeping_task_low.WaitUntilSleeping(); + + WriteOptions write_options; + write_options.sync = false; + + std::string key_space, value_space; + ASSERT_OK( + db_->Put(write_options, Key(1, &key_space), Value(1, &value_space))); + FlushOptions flush_options; + flush_options.wait = false; + ASSERT_OK(db_->Flush(flush_options)); + write_options.sync = true; + ASSERT_OK( + db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); + db_->FlushWAL(false); + + env_->SetFilesystemActive(false); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + ASSERT_OK(OpenDB()); + std::string val; + Value(2, &value_space); + ASSERT_OK(ReadValue(2, &val)); + ASSERT_EQ(value_space, val); + + Value(1, &value_space); + ASSERT_OK(ReadValue(1, &val)); + ASSERT_EQ(value_space, val); +} + +TEST_P(FaultInjectionTest, UninstalledCompaction) { + options_.target_file_size_base = 32 * 1024; + options_.write_buffer_size = 100 << 10; // 100KB + options_.level0_file_num_compaction_trigger = 6; + options_.level0_stop_writes_trigger = 1 << 10; + options_.level0_slowdown_writes_trigger = 1 << 10; + options_.max_background_compactions = 1; + OpenDB(); + + if (!sequential_order_) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"}, + {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"}, + {"FaultInjectionTest::FaultTest:2", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}, + }); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + int kNumKeys = 1000; + Build(WriteOptions(), 0, kNumKeys); + FlushOptions flush_options; + flush_options.wait = true; + db_->Flush(flush_options); + ASSERT_OK(db_->Put(WriteOptions(), "", "")); + TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0"); + TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1"); + env_->SetFilesystemActive(false); + TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2"); + CloseDB(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ResetDBState(kResetDropUnsyncedData); + + std::atomic opened(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::Open:Opened", [&](void* /*arg*/) { opened.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", + [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(OpenDB()); + ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound)); + WaitCompactionFinish(); + ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(FaultInjectionTest, ManualLogSyncTest) { + test::SleepingBackgroundTask sleeping_task_low; + env_->SetBackgroundThreads(1, Env::HIGH); + // Block the job queue to prevent flush job from running. 
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::HIGH); + sleeping_task_low.WaitUntilSleeping(); + + WriteOptions write_options; + write_options.sync = false; + + std::string key_space, value_space; + ASSERT_OK( + db_->Put(write_options, Key(1, &key_space), Value(1, &value_space))); + FlushOptions flush_options; + flush_options.wait = false; + ASSERT_OK(db_->Flush(flush_options)); + ASSERT_OK( + db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); + ASSERT_OK(db_->FlushWAL(true)); + + env_->SetFilesystemActive(false); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + ASSERT_OK(OpenDB()); + std::string val; + Value(2, &value_space); + ASSERT_OK(ReadValue(2, &val)); + ASSERT_EQ(value_space, val); + + Value(1, &value_space); + ASSERT_OK(ReadValue(1, &val)); + ASSERT_EQ(value_space, val); +} + +TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) { + ReadOptions ro; + Options options = CurrentOptions(); + options.env = env_; + + WriteOptions wo; + wo.sync = true; + wo.disableWAL = false; + WriteBatch batch; + batch.Put("cats", "dogs"); + batch.MarkWalTerminationPoint(); + batch.Put("boys", "girls"); + ASSERT_OK(db_->Write(wo, &batch)); + + env_->SetFilesystemActive(false); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + ASSERT_OK(OpenDB()); + + std::string val; + ASSERT_OK(db_->Get(ro, "cats", &val)); + ASSERT_EQ("dogs", val); + ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound()); +} + +INSTANTIATE_TEST_CASE_P( + FaultTest, FaultInjectionTest, + ::testing::Values(std::make_tuple(false, kDefault, kEnd), + std::make_tuple(true, kDefault, kEnd))); + +INSTANTIATE_TEST_CASE_P( + FaultTest, FaultInjectionTestSplitted, + ::testing::Values(std::make_tuple(false, kDefault, kSyncWal), + std::make_tuple(true, kDefault, kSyncWal), + std::make_tuple(false, kSyncWal, kEnd), + std::make_tuple(true, kSyncWal, kEnd))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc new file mode 100644 index 000000000..523cb3c16 --- /dev/null +++ b/src/rocksdb/db/file_indexer.cc @@ -0,0 +1,216 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
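// Editorial sketch: WriteBatchWalTerminationTest above relies on
// WriteBatch::MarkWalTerminationPoint(). Updates appended after the mark are
// applied to the memtable but excluded from the WAL, so they do not survive a
// crash and recovery cycle. A minimal standalone usage follows; the scratch
// path is an arbitrary assumption and error handling is reduced to asserts.
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/write_batch.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  assert(rocksdb::DB::Open(options, "/tmp/wal_term_demo", &db).ok());

  rocksdb::WriteBatch batch;
  batch.Put("cats", "dogs");        // goes to both memtable and WAL
  batch.MarkWalTerminationPoint();  // everything below skips the WAL
  batch.Put("boys", "girls");       // memtable only; lost after a crash
  rocksdb::WriteOptions wo;
  wo.sync = true;
  assert(db->Write(wo, &batch).ok());

  std::string val;
  // Both keys are visible through this handle; after a crash and reopen,
  // only "cats" would be recovered from the WAL.
  assert(db->Get(rocksdb::ReadOptions(), "cats", &val).ok());
  assert(db->Get(rocksdb::ReadOptions(), "boys", &val).ok());
  delete db;
  return 0;
}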
+ +#include "db/file_indexer.h" +#include +#include +#include "db/version_edit.h" +#include "rocksdb/comparator.h" + +namespace ROCKSDB_NAMESPACE { + +FileIndexer::FileIndexer(const Comparator* ucmp) + : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {} + +size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); } + +size_t FileIndexer::LevelIndexSize(size_t level) const { + if (level >= next_level_index_.size()) { + return 0; + } + return next_level_index_[level].num_index; +} + +void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, + int32_t* right_bound) const { + assert(level > 0); + + // Last level, no hint + if (level == num_levels_ - 1) { + *left_bound = 0; + *right_bound = -1; + return; + } + + assert(level < num_levels_ - 1); + assert(static_cast(file_index) <= level_rb_[level]); + + const IndexUnit* index_units = next_level_index_[level].index_units; + const auto& index = index_units[file_index]; + + if (cmp_smallest < 0) { + *left_bound = (level > 0 && file_index > 0) + ? index_units[file_index - 1].largest_lb + : 0; + *right_bound = index.smallest_rb; + } else if (cmp_smallest == 0) { + *left_bound = index.smallest_lb; + *right_bound = index.smallest_rb; + } else if (cmp_smallest > 0 && cmp_largest < 0) { + *left_bound = index.smallest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest == 0) { + *left_bound = index.largest_lb; + *right_bound = index.largest_rb; + } else if (cmp_largest > 0) { + *left_bound = index.largest_lb; + *right_bound = level_rb_[level + 1]; + } else { + assert(false); + } + + assert(*left_bound >= 0); + assert(*left_bound <= *right_bound + 1); + assert(*right_bound <= level_rb_[level + 1]); +} + +void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels, + std::vector* const files) { + if (files == nullptr) { + return; + } + if (num_levels == 0) { // uint_32 0-1 would cause bad behavior + num_levels_ = num_levels; + return; + } + assert(level_rb_ == nullptr); // level_rb_ should be init here + + num_levels_ = num_levels; + next_level_index_.resize(num_levels); + + char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t)); + level_rb_ = new (mem) int32_t[num_levels_]; + for (size_t i = 0; i < num_levels_; i++) { + level_rb_[i] = -1; + } + + // L1 - Ln-1 + for (size_t level = 1; level < num_levels_ - 1; ++level) { + const auto& upper_files = files[level]; + const int32_t upper_size = static_cast(upper_files.size()); + const auto& lower_files = files[level + 1]; + level_rb_[level] = static_cast(upper_files.size()) - 1; + if (upper_size == 0) { + continue; + } + IndexLevel& index_level = next_level_index_[level]; + index_level.num_index = upper_size; + mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit)); + index_level.index_units = new (mem) IndexUnit[upper_size]; + + CalculateLB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(), + b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; }); + CalculateLB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->largest.user_key(), + b->largest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; }); + CalculateRB( + upper_files, lower_files, &index_level, + [this](const 
FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(), + b->smallest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; }); + CalculateRB( + upper_files, lower_files, &index_level, + [this](const FileMetaData* a, const FileMetaData* b) -> int { + return ucmp_->CompareWithoutTimestamp(a->largest.user_key(), + b->smallest.user_key()); + }, + [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; }); + } + + level_rb_[num_levels_ - 1] = + static_cast(files[num_levels_ - 1].size()) - 1; +} + +void FileIndexer::CalculateLB( + const std::vector& upper_files, + const std::vector& lower_files, IndexLevel* index_level, + std::function cmp_op, + std::function set_index) { + const int32_t upper_size = static_cast(upper_files.size()); + const int32_t lower_size = static_cast(lower_files.size()); + int32_t upper_idx = 0; + int32_t lower_idx = 0; + + IndexUnit* index = index_level->index_units; + while (upper_idx < upper_size && lower_idx < lower_size) { + int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]); + + if (cmp == 0) { + set_index(&index[upper_idx], lower_idx); + ++upper_idx; + } else if (cmp > 0) { + // Lower level's file (largest) is smaller, a key won't hit in that + // file. Move to next lower file + ++lower_idx; + } else { + // Lower level's file becomes larger, update the index, and + // move to the next upper file + set_index(&index[upper_idx], lower_idx); + ++upper_idx; + } + } + + while (upper_idx < upper_size) { + // Lower files are exhausted, that means the remaining upper files are + // greater than any lower files. Set the index to be the lower level size. + set_index(&index[upper_idx], lower_size); + ++upper_idx; + } +} + +void FileIndexer::CalculateRB( + const std::vector& upper_files, + const std::vector& lower_files, IndexLevel* index_level, + std::function cmp_op, + std::function set_index) { + const int32_t upper_size = static_cast(upper_files.size()); + const int32_t lower_size = static_cast(lower_files.size()); + int32_t upper_idx = upper_size - 1; + int32_t lower_idx = lower_size - 1; + + IndexUnit* index = index_level->index_units; + while (upper_idx >= 0 && lower_idx >= 0) { + int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]); + + if (cmp == 0) { + set_index(&index[upper_idx], lower_idx); + --upper_idx; + } else if (cmp < 0) { + // Lower level's file (smallest) is larger, a key won't hit in that + // file. Move to next lower file. + --lower_idx; + } else { + // Lower level's file becomes smaller, update the index, and move to + // the next the upper file + set_index(&index[upper_idx], lower_idx); + --upper_idx; + } + } + while (upper_idx >= 0) { + // Lower files are exhausted, that means the remaining upper files are + // smaller than any lower files. Set it to -1. + set_index(&index[upper_idx], -1); + --upper_idx; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h new file mode 100644 index 000000000..ad7553f2c --- /dev/null +++ b/src/rocksdb/db/file_indexer.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include +#include "memory/arena.h" +#include "port/port.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class Comparator; +struct FileMetaData; +struct FdWithKeyRange; +struct FileLevel; + +// The file tree structure in Version is prebuilt and the range of each file +// is known. On Version::Get(), it uses binary search to find a potential file +// and then check if a target key can be found in the file by comparing the key +// to each file's smallest and largest key. The results of these comparisons +// can be reused beyond checking if a key falls into a file's range. +// With some pre-calculated knowledge, each key comparison that has been done +// can serve as a hint to narrow down further searches: if a key compared to +// be smaller than a file's smallest or largest, that comparison can be used +// to find out the right bound of next binary search. Similarly, if a key +// compared to be larger than a file's smallest or largest, it can be utilized +// to find out the left bound of next binary search. +// With these hints: it can greatly reduce the range of binary search, +// especially for bottom levels, given that one file most likely overlaps with +// only N files from level below (where N is max_bytes_for_level_multiplier). +// So on level L, we will only look at ~N files instead of N^L files on the +// naive approach. +class FileIndexer { + public: + explicit FileIndexer(const Comparator* ucmp); + + size_t NumLevelIndex() const; + + size_t LevelIndexSize(size_t level) const; + + // Return a file index range in the next level to search for a key based on + // smallest and largest key comparison for the current file specified by + // level and file_index. When *left_index < *right_index, both index should + // be valid and fit in the vector size. + void GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, const int cmp_largest, + int32_t* left_bound, int32_t* right_bound) const; + + void UpdateIndex(Arena* arena, const size_t num_levels, + std::vector* const files); + + enum { + // MSVC version 1800 still does not have constexpr for ::max() + kLevelMaxIndex = ROCKSDB_NAMESPACE::port::kMaxInt32 + }; + + private: + size_t num_levels_; + const Comparator* ucmp_; + + struct IndexUnit { + IndexUnit() + : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {} + // During file search, a key is compared against smallest and largest + // from a FileMetaData. It can have 3 possible outcomes: + // (1) key is smaller than smallest, implying it is also smaller than + // larger. Precalculated index based on "smallest < smallest" can + // be used to provide right bound. + // (2) key is in between smallest and largest. + // Precalculated index based on "smallest > greatest" can be used to + // provide left bound. + // Precalculated index based on "largest < smallest" can be used to + // provide right bound. + // (3) key is larger than largest, implying it is also larger than smallest. + // Precalculated index based on "largest > largest" can be used to + // provide left bound. + // + // As a result, we will need to do: + // Compare smallest (<=) and largest keys from upper level file with + // smallest key from lower level to get a right bound. 
+ // Compare smallest (>=) and largest keys from upper level file with + // largest key from lower level to get a left bound. + // + // Example: + // level 1: [50 - 60] + // level 2: [1 - 40], [45 - 55], [58 - 80] + // A key 35, compared to be less than 50, 3rd file on level 2 can be + // skipped according to rule (1). LB = 0, RB = 1. + // A key 53, sits in the middle 50 and 60. 1st file on level 2 can be + // skipped according to rule (2)-a, but the 3rd file cannot be skipped + // because 60 is greater than 58. LB = 1, RB = 2. + // A key 70, compared to be larger than 60. 1st and 2nd file can be skipped + // according to rule (3). LB = 2, RB = 2. + // + // Point to a left most file in a lower level that may contain a key, + // which compares greater than smallest of a FileMetaData (upper level) + int32_t smallest_lb; + // Point to a left most file in a lower level that may contain a key, + // which compares greater than largest of a FileMetaData (upper level) + int32_t largest_lb; + // Point to a right most file in a lower level that may contain a key, + // which compares smaller than smallest of a FileMetaData (upper level) + int32_t smallest_rb; + // Point to a right most file in a lower level that may contain a key, + // which compares smaller than largest of a FileMetaData (upper level) + int32_t largest_rb; + }; + + // Data structure to store IndexUnits in a whole level + struct IndexLevel { + size_t num_index; + IndexUnit* index_units; + + IndexLevel() : num_index(0), index_units(nullptr) {} + }; + + void CalculateLB( + const std::vector& upper_files, + const std::vector& lower_files, IndexLevel* index_level, + std::function cmp_op, + std::function set_index); + + void CalculateRB( + const std::vector& upper_files, + const std::vector& lower_files, IndexLevel* index_level, + std::function cmp_op, + std::function set_index); + + autovector next_level_index_; + int32_t* level_rb_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc new file mode 100644 index 000000000..99ce93993 --- /dev/null +++ b/src/rocksdb/db/file_indexer_test.cc @@ -0,0 +1,350 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
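// Editorial sketch: the precomputation described in the IndexUnit comment
// above, replayed on its own example (level 1 file [50, 60] over level 2
// files [1, 40], [45, 55], [58, 80]). Range, Hints, and Precompute are
// simplified stand-ins, not the real FileIndexer types; they only show how
// one linear scan of the sorted lower level yields the four bounds.
#include <cassert>
#include <cstdio>
#include <vector>

struct Range { int smallest, largest; };

// Simplified analogues of the four IndexUnit fields.
struct Hints { int smallest_lb, largest_lb, smallest_rb, largest_rb; };

Hints Precompute(const Range& upper, const std::vector<Range>& lower) {
  const int n = static_cast<int>(lower.size());
  Hints h{n, n, -1, -1};
  for (int i = 0; i < n; ++i) {
    // Leftmost lower file that may hold keys >= upper.smallest / upper.largest.
    if (lower[i].largest >= upper.smallest && h.smallest_lb == n) h.smallest_lb = i;
    if (lower[i].largest >= upper.largest && h.largest_lb == n) h.largest_lb = i;
    // Rightmost lower file that may hold keys <= upper.smallest / upper.largest.
    if (lower[i].smallest <= upper.smallest) h.smallest_rb = i;
    if (lower[i].smallest <= upper.largest) h.largest_rb = i;
  }
  return h;
}

int main() {
  const Range upper{50, 60};
  const std::vector<Range> lower{{1, 40}, {45, 55}, {58, 80}};
  const Hints h = Precompute(upper, lower);

  // key 35 (< 50): search level 2 in [0, smallest_rb] = [0, 1].
  assert(h.smallest_rb == 1);
  // key 53 (between 50 and 60): search [smallest_lb, largest_rb] = [1, 2].
  assert(h.smallest_lb == 1 && h.largest_rb == 2);
  // key 70 (> 60): search [largest_lb, end] = [2, 2].
  assert(h.largest_lb == 2);
  std::printf("hints match the worked example in the comment\n");
  return 0;
}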
+ +#include "db/file_indexer.h" +#include +#include "db/dbformat.h" +#include "db/version_edit.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +class IntComparator : public Comparator { + public: + int Compare(const Slice& a, const Slice& b) const override { + assert(a.size() == 8); + assert(b.size() == 8); + int64_t diff = *reinterpret_cast(a.data()) - + *reinterpret_cast(b.data()); + if (diff < 0) { + return -1; + } else if (diff == 0) { + return 0; + } else { + return 1; + } + } + + const char* Name() const override { return "IntComparator"; } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +class FileIndexerTest : public testing::Test { + public: + FileIndexerTest() + : kNumLevels(4), files(new std::vector[kNumLevels]) {} + + ~FileIndexerTest() override { + ClearFiles(); + delete[] files; + } + + void AddFile(int level, int64_t smallest, int64_t largest) { + auto* f = new FileMetaData(); + f->smallest = IntKey(smallest); + f->largest = IntKey(largest); + files[level].push_back(f); + } + + InternalKey IntKey(int64_t v) { + return InternalKey(Slice(reinterpret_cast(&v), 8), 0, kTypeValue); + } + + void ClearFiles() { + for (uint32_t i = 0; i < kNumLevels; ++i) { + for (auto* f : files[i]) { + delete f; + } + files[i].clear(); + } + } + + void GetNextLevelIndex(const uint32_t level, const uint32_t file_index, + const int cmp_smallest, const int cmp_largest, int32_t* left_index, + int32_t* right_index) { + *left_index = 100; + *right_index = 100; + indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest, + left_index, right_index); + } + + int32_t left = 100; + int32_t right = 100; + const uint32_t kNumLevels; + IntComparator ucmp; + FileIndexer* indexer; + + std::vector* files; +}; + +// Case 0: Empty +TEST_F(FileIndexerTest, Empty) { + Arena arena; + indexer = new FileIndexer(&ucmp); + indexer->UpdateIndex(&arena, 0, files); + delete indexer; +} + +// Case 1: no overlap, files are on the left of next level files +TEST_F(FileIndexerTest, no_overlap_left) { + Arena arena; + indexer = new FileIndexer(&ucmp); + // level 1 + AddFile(1, 100, 200); + AddFile(1, 300, 400); + AddFile(1, 500, 600); + // level 2 + AddFile(2, 1500, 1600); + AddFile(2, 1601, 1699); + AddFile(2, 1700, 1800); + // level 3 + AddFile(3, 2500, 2600); + AddFile(3, 2601, 2699); + AddFile(3, 2700, 2800); + indexer->UpdateIndex(&arena, kNumLevels, files); + for (uint32_t level = 1; level < 3; ++level) { + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(level, f, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, 0, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(level, f, 1, 1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(2, right); + } + } + delete indexer; + ClearFiles(); +} + +// Case 2: no overlap, files are on the right of next level files +TEST_F(FileIndexerTest, no_overlap_right) { + Arena arena; + indexer = new FileIndexer(&ucmp); + // level 1 + AddFile(1, 2100, 2200); + AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 2 + AddFile(2, 1500, 1600); + 
AddFile(2, 1501, 1699); + AddFile(2, 1700, 1800); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 501, 699); + AddFile(3, 700, 800); + indexer->UpdateIndex(&arena, kNumLevels, files); + for (uint32_t level = 1; level < 3; ++level) { + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(level, f, -1, -1, &left, &right); + ASSERT_EQ(f == 0 ? 0 : 3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 0, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 0, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(level, f, 1, 1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + } + } + delete indexer; +} + +// Case 3: empty L2 +TEST_F(FileIndexerTest, empty_L2) { + Arena arena; + indexer = new FileIndexer(&ucmp); + for (uint32_t i = 1; i < kNumLevels; ++i) { + ASSERT_EQ(0U, indexer->LevelIndexSize(i)); + } + // level 1 + AddFile(1, 2100, 2200); + AddFile(1, 2300, 2400); + AddFile(1, 2500, 2600); + // level 3 + AddFile(3, 500, 600); + AddFile(3, 501, 699); + AddFile(3, 700, 800); + indexer->UpdateIndex(&arena, kNumLevels, files); + for (uint32_t f = 0; f < 3; ++f) { + GetNextLevelIndex(1, f, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 0, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + GetNextLevelIndex(1, f, 1, 1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(-1, right); + } + delete indexer; + ClearFiles(); +} + +// Case 4: mixed +TEST_F(FileIndexerTest, mixed) { + Arena arena; + indexer = new FileIndexer(&ucmp); + // level 1 + AddFile(1, 100, 200); + AddFile(1, 250, 400); + AddFile(1, 450, 500); + // level 2 + AddFile(2, 100, 150); // 0 + AddFile(2, 200, 250); // 1 + AddFile(2, 251, 300); // 2 + AddFile(2, 301, 350); // 3 + AddFile(2, 500, 600); // 4 + // level 3 + AddFile(3, 0, 50); + AddFile(3, 100, 200); + AddFile(3, 201, 250); + indexer->UpdateIndex(&arena, kNumLevels, files); + // level 1, 0 + GetNextLevelIndex(1, 0, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 0, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(0, right); + GetNextLevelIndex(1, 0, 1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 0, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 0, 1, 1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(4, right); + // level 1, 1 + GetNextLevelIndex(1, 1, -1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(1, 1, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 0, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 1, 1, 1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + // level 1, 2 + GetNextLevelIndex(1, 2, -1, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + 
GetNextLevelIndex(1, 2, 0, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(3, right); + GetNextLevelIndex(1, 2, 1, -1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + GetNextLevelIndex(1, 2, 1, 0, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + GetNextLevelIndex(1, 2, 1, 1, &left, &right); + ASSERT_EQ(4, left); + ASSERT_EQ(4, right); + // level 2, 0 + GetNextLevelIndex(2, 0, -1, -1, &left, &right); + ASSERT_EQ(0, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, 0, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 0, 1, 1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(2, right); + // level 2, 1 + GetNextLevelIndex(2, 1, -1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 1, 0, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(1, right); + GetNextLevelIndex(2, 1, 1, -1, &left, &right); + ASSERT_EQ(1, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, 1, 1, 0, &left, &right); + ASSERT_EQ(2, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, 1, 1, 1, &left, &right); + ASSERT_EQ(2, left); + ASSERT_EQ(2, right); + // level 2, [2 - 4], no overlap + for (uint32_t f = 2; f <= 4; ++f) { + GetNextLevelIndex(2, f, -1, -1, &left, &right); + ASSERT_EQ(f == 2 ? 2 : 3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 0, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, -1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, 0, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + GetNextLevelIndex(2, f, 1, 1, &left, &right); + ASSERT_EQ(3, left); + ASSERT_EQ(2, right); + } + delete indexer; + ClearFiles(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc new file mode 100644 index 000000000..9a04542f6 --- /dev/null +++ b/src/rocksdb/db/filename_test.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
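// Editorial note on the FileIndexer tests above: the third and fourth
// arguments to GetNextLevelIndex() encode the outcome of comparing the search
// key with the upper-level file's boundaries, i.e. cmp_smallest =
// sgn(key - smallest) and cmp_largest = sgn(key - largest), each in
// {-1, 0, 1}. A pair such as (1, -1) therefore means "the key falls strictly
// inside the file's range", and (1, 1) means "the key is past the file".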
+ +#include "file/filename.h" + +#include "db/dbformat.h" +#include "logging/logging.h" +#include "port/port.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class FileNameTest : public testing::Test {}; + +TEST_F(FileNameTest, Parse) { + Slice db; + FileType type; + uint64_t number; + + char kDefautInfoLogDir = 1; + char kDifferentInfoLogDir = 2; + char kNoCheckLogDir = 4; + char kAllMode = kDefautInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir; + + // Successful parses + static struct { + const char* fname; + uint64_t number; + FileType type; + char mode; + } cases[] = { + {"100.log", 100, kLogFile, kAllMode}, + {"0.log", 0, kLogFile, kAllMode}, + {"0.sst", 0, kTableFile, kAllMode}, + {"CURRENT", 0, kCurrentFile, kAllMode}, + {"LOCK", 0, kDBLockFile, kAllMode}, + {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, + {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, + {"METADB-2", 2, kMetaDatabase, kAllMode}, + {"METADB-7", 7, kMetaDatabase, kAllMode}, + {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, + {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, + {"18446744073709551615.log", 18446744073709551615ull, kLogFile, + kAllMode}, }; + for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) { + for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { + InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir"); + if (cases[i].mode & mode) { + std::string f = cases[i].fname; + if (mode == kNoCheckLogDir) { + ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; + } else { + ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type)) + << f; + } + ASSERT_EQ(cases[i].type, type) << f; + ASSERT_EQ(cases[i].number, number) << f; + } + } + } + + // Errors + static const char* errors[] = { + "", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "META", + "METADB", + "METADB-", + "XMETADB-3", + "METADB-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop" + }; + for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { + std::string f = errors[i]; + ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; + }; +} + +TEST_F(FileNameTest, InfoLogFileName) { + std::string dbname = ("/data/rocksdb"); + std::string db_absolute_path; + Env::Default()->GetAbsolutePath(dbname, &db_absolute_path); + + ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, "")); + ASSERT_EQ("/data/rocksdb/LOG.old.666", + OldInfoLogFileName(dbname, 666u, db_absolute_path, "")); + + ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG", + InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log")); + ASSERT_EQ( + "/data/rocksdb_log/data_rocksdb_LOG.old.666", + OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log")); +} + +TEST_F(FileNameTest, Construction) { + uint64_t number; + FileType type; + std::string fname; + + fname = CurrentFileName("foo"); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kCurrentFile, type); + + fname = LockFileName("foo"); + 
ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(0U, number); + ASSERT_EQ(kDBLockFile, type); + + fname = LogFileName("foo", 192); + ASSERT_EQ("foo/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(192U, number); + ASSERT_EQ(kLogFile, type); + + fname = TableFileName({DbPath("bar", 0)}, 200, 0); + std::string fname1 = + TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1); + ASSERT_EQ(fname, fname1); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(200U, number); + ASSERT_EQ(kTableFile, type); + + fname = DescriptorFileName("bar", 100); + ASSERT_EQ("bar/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kDescriptorFile, type); + + fname = TempFileName("tmp", 999); + ASSERT_EQ("tmp/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(999U, number); + ASSERT_EQ(kTempFile, type); + + fname = MetaDatabaseName("met", 100); + ASSERT_EQ("met/", std::string(fname.data(), 4)); + ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); + ASSERT_EQ(100U, number); + ASSERT_EQ(kMetaDatabase, type); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc new file mode 100644 index 000000000..997bd8080 --- /dev/null +++ b/src/rocksdb/db/flush_job.cc @@ -0,0 +1,466 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/flush_job.h" + +#include + +#include +#include + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/event_helpers.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "logging/event_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/merging_iterator.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +const char* GetFlushReasonString (FlushReason flush_reason) { + switch (flush_reason) { + case FlushReason::kOthers: + return "Other Reasons"; + case FlushReason::kGetLiveFiles: + return "Get Live Files"; + case FlushReason::kShutDown: + return "Shut down"; + case FlushReason::kExternalFileIngestion: + return "External File Ingestion"; + case FlushReason::kManualCompaction: + return "Manual Compaction"; + case FlushReason::kWriteBufferManager: + return "Write Buffer Manager"; + case FlushReason::kWriteBufferFull: + return "Write Buffer Full"; + case FlushReason::kTest: + return "Test"; + case FlushReason::kDeleteFiles: + return "Delete Files"; + case FlushReason::kAutoCompaction: + return "Auto Compaction"; + case FlushReason::kManualFlush: + return "Manual Flush"; + case FlushReason::kErrorRecovery: + return "Error Recovery"; + default: + return "Invalid"; + } +} + +FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const MutableCFOptions& mutable_cf_options, + const uint64_t* max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, + std::atomic* shutting_down, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, JobContext* job_context, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_file_directory, + CompressionType output_compression, Statistics* stats, + EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri) + : dbname_(dbname), + cfd_(cfd), + db_options_(db_options), + mutable_cf_options_(mutable_cf_options), + max_memtable_id_(max_memtable_id), + file_options_(file_options), + versions_(versions), + db_mutex_(db_mutex), + shutting_down_(shutting_down), + existing_snapshots_(std::move(existing_snapshots)), + earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), + job_context_(job_context), + log_buffer_(log_buffer), + db_directory_(db_directory), + output_file_directory_(output_file_directory), + output_compression_(output_compression), + stats_(stats), + event_logger_(event_logger), + measure_io_stats_(measure_io_stats), + sync_output_directory_(sync_output_directory), + 
write_manifest_(write_manifest), + edit_(nullptr), + base_(nullptr), + pick_memtable_called(false), + thread_pri_(thread_pri) { + // Update the thread status to indicate flush. + ReportStartedFlush(); + TEST_SYNC_POINT("FlushJob::FlushJob()"); +} + +FlushJob::~FlushJob() { + ThreadStatusUtil::ResetThreadStatus(); +} + +void FlushJob::ReportStartedFlush() { + ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, + db_options_.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::COMPACTION_JOB_ID, + job_context_->job_id); + IOSTATS_RESET(bytes_written); +} + +void FlushJob::ReportFlushInputSize(const autovector& mems) { + uint64_t input_size = 0; + for (auto* mem : mems) { + input_size += mem->ApproximateMemoryUsage(); + } + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::FLUSH_BYTES_MEMTABLES, + input_size); +} + +void FlushJob::RecordFlushIOStats() { + RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); + ThreadStatusUtil::IncreaseThreadOperationProperty( + ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} + +void FlushJob::PickMemTable() { + db_mutex_->AssertHeld(); + assert(!pick_memtable_called); + pick_memtable_called = true; + // Save the contents of the earliest memtable as a new Table + cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_); + if (mems_.empty()) { + return; + } + + ReportFlushInputSize(mems_); + + // entries mems are (implicitly) sorted in ascending order by their created + // time. We will use the first memtable's `edit` to keep the meta info for + // this flush. + MemTable* m = mems_[0]; + edit_ = m->GetEdits(); + edit_->SetPrevLogNumber(0); + // SetLogNumber(log_num) indicates logs with number smaller than log_num + // will no longer be picked up for recovery. + edit_->SetLogNumber(mems_.back()->GetNextLogNumber()); + edit_->SetColumnFamily(cfd_->GetID()); + + // path 0 for level 0 file. + meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + + base_ = cfd_->current(); + base_->Ref(); // it is likely that we do not need this reference +} + +Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, + FileMetaData* file_meta) { + TEST_SYNC_POINT("FlushJob::Start"); + db_mutex_->AssertHeld(); + assert(pick_memtable_called); + AutoThreadOperationStageUpdater stage_run( + ThreadStatus::STAGE_FLUSH_RUN); + if (mems_.empty()) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush", + cfd_->GetName().c_str()); + return Status::OK(); + } + + // I/O measurement variables + PerfLevel prev_perf_level = PerfLevel::kEnableTime; + uint64_t prev_write_nanos = 0; + uint64_t prev_fsync_nanos = 0; + uint64_t prev_range_sync_nanos = 0; + uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; + if (measure_io_stats_) { + prev_perf_level = GetPerfLevel(); + SetPerfLevel(PerfLevel::kEnableTime); + prev_write_nanos = IOSTATS(write_nanos); + prev_fsync_nanos = IOSTATS(fsync_nanos); + prev_range_sync_nanos = IOSTATS(range_sync_nanos); + prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); + } + + // This will release and re-acquire the mutex. 
+  Status s = WriteLevel0Table();
+
+  if (s.ok() && cfd_->IsDropped()) {
+    s = Status::ColumnFamilyDropped("Column family dropped during compaction");
+  }
+  if ((s.ok() || s.IsColumnFamilyDropped()) &&
+      shutting_down_->load(std::memory_order_acquire)) {
+    s = Status::ShutdownInProgress("Database shutdown");
+  }
+
+  if (!s.ok()) {
+    cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
+  } else if (write_manifest_) {
+    TEST_SYNC_POINT("FlushJob::InstallResults");
+    // Replace immutable memtable with the generated Table
+    s = cfd_->imm()->TryInstallMemtableFlushResults(
+        cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
+        meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
+        log_buffer_, &committed_flush_jobs_info_);
+  }
+
+  if (s.ok() && file_meta != nullptr) {
+    *file_meta = meta_;
+  }
+  RecordFlushIOStats();
+
+  // When measure_io_stats_ is true, the default 512 bytes is not enough.
+  auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
+  stream << "job" << job_context_->job_id << "event"
+         << "flush_finished";
+  stream << "output_compression"
+         << CompressionTypeToString(output_compression_);
+  stream << "lsm_state";
+  stream.StartArray();
+  auto vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+  stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
+
+  if (measure_io_stats_) {
+    if (prev_perf_level != PerfLevel::kEnableTime) {
+      SetPerfLevel(prev_perf_level);
+    }
+    stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
+    stream << "file_range_sync_nanos"
+           << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
+    stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
+    stream << "file_prepare_write_nanos"
+           << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
+    stream << "file_cpu_write_nanos"
+           << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
+    stream << "file_cpu_read_nanos"
+           << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
+  }
+
+  return s;
+}
+
+void FlushJob::Cancel() {
+  db_mutex_->AssertHeld();
+  assert(base_ != nullptr);
+  base_->Unref();
+}
+
+Status FlushJob::WriteLevel0Table() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_FLUSH_WRITE_L0);
+  db_mutex_->AssertHeld();
+  const uint64_t start_micros = db_options_.env->NowMicros();
+  const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000;
+  Status s;
+  {
+    auto write_hint = cfd_->CalculateSSTWriteHint(0);
+    db_mutex_->Unlock();
+    if (log_buffer_) {
+      log_buffer_->FlushBufferToLog();
+    }
+    // memtables and range_del_iters store internal iterators over each data
+    // memtable and its associated range deletion memtable, respectively, at
+    // corresponding indexes.
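+    // Roughly, each memtable contributes one internal iterator, and the
+    // iterators are merged into a single sorted stream before the table is
+    // built (illustrative sketch; `m1`, `m2` and `cfd` stand in for the
+    // values iterated below):
+    //
+    //     std::vector<InternalIterator*> its = {m1->NewIterator(ro, &arena),
+    //                                           m2->NewIterator(ro, &arena)};
+    //     InternalIterator* merged = NewMergingIterator(
+    //         &cfd->internal_comparator(), &its[0],
+    //         static_cast<int>(its.size()), &arena);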
+    std::vector<InternalIterator*> memtables;
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters;
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    Arena arena;
+    uint64_t total_num_entries = 0, total_num_deletes = 0;
+    uint64_t total_data_size = 0;
+    size_t total_memory_usage = 0;
+    for (MemTable* m : mems_) {
+      ROCKS_LOG_INFO(
+          db_options_.info_log,
+          "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+          cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+      memtables.push_back(m->NewIterator(ro, &arena));
+      auto* range_del_iter =
+          m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+      if (range_del_iter != nullptr) {
+        range_del_iters.emplace_back(range_del_iter);
+      }
+      total_num_entries += m->num_entries();
+      total_num_deletes += m->num_deletes();
+      total_data_size += m->get_data_size();
+      total_memory_usage += m->ApproximateMemoryUsage();
+    }
+
+    event_logger_->Log() << "job" << job_context_->job_id << "event"
+                         << "flush_started"
+                         << "num_memtables" << mems_.size() << "num_entries"
+                         << total_num_entries << "num_deletes"
+                         << total_num_deletes << "total_data_size"
+                         << total_data_size << "memory_usage"
+                         << total_memory_usage << "flush_reason"
+                         << GetFlushReasonString(cfd_->GetFlushReason());
+
+    {
+      ScopedArenaIterator iter(
+          NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
+                             static_cast<int>(memtables.size()), &arena));
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+                     cfd_->GetName().c_str(), job_context_->job_id,
+                     meta_.fd.GetNumber());
+
+      TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+                               &output_compression_);
+      int64_t _current_time = 0;
+      auto status = db_options_.env->GetCurrentTime(&_current_time);
+      // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Failed to get current time to populate creation_time property. "
+            "Status: %s",
+            status.ToString().c_str());
+      }
+      const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+      uint64_t oldest_key_time =
+          mems_.front()->ApproximateOldestKeyTime();
+
+      // It's not clear whether oldest_key_time is always available. In case
+      // it is not available, use current_time.
+      meta_.oldest_ancester_time = std::min(current_time, oldest_key_time);
+      meta_.file_creation_time = current_time;
+
+      uint64_t creation_time = (cfd_->ioptions()->compaction_style ==
+                                CompactionStyle::kCompactionStyleFIFO)
+                                   ? current_time
+                                   : meta_.oldest_ancester_time;
+
+      s = BuildTable(
+          dbname_, db_options_.env, db_options_.fs.get(), *cfd_->ioptions(),
+          mutable_cf_options_, file_options_, cfd_->table_cache(), iter.get(),
+          std::move(range_del_iters), &meta_, cfd_->internal_comparator(),
+          cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
+          cfd_->GetName(), existing_snapshots_,
+          earliest_write_conflict_snapshot_, snapshot_checker_,
+          output_compression_, mutable_cf_options_.sample_for_compression,
+          cfd_->ioptions()->compression_opts,
+          mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
+          TableFileCreationReason::kFlush, event_logger_, job_context_->job_id,
+          Env::IO_HIGH, &table_properties_, 0 /* level */,
+          creation_time, oldest_key_time, write_hint, current_time);
+      LogFlush(db_options_.info_log);
+    }
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
+                   " bytes %s"
+                   "%s",
+                   cfd_->GetName().c_str(), job_context_->job_id,
+                   meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
+                   s.ToString().c_str(),
+                   meta_.marked_for_compaction ? " (needs compaction)" : "");
+
+    if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
+      s = output_file_directory_->Fsync();
+    }
+    TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
+    db_mutex_->Lock();
+  }
+  base_->Unref();
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  if (s.ok() && meta_.fd.GetFileSize() > 0) {
+    // if we have more than 1 background thread, then we cannot
+    // insert files directly into higher levels because some other
+    // threads could be concurrently producing compacted files for
+    // that key range.
+    // Add file to L0
+    edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
+                   meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
+                   meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
+                   meta_.marked_for_compaction, meta_.oldest_blob_file_number,
+                   meta_.oldest_ancester_time, meta_.file_creation_time,
+                   meta_.file_checksum, meta_.file_checksum_func_name);
+  }
+#ifndef ROCKSDB_LITE
+  // Piggyback FlushJobInfo on the first flushed memtable.
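+  // The FlushJobInfo rides on mems_[0] so that whichever flush job ends up
+  // committing this memtable's flush result can collect it and fire
+  // OnFlushCompleted for it (see committed_flush_jobs_info_ in flush_job.h).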
+  mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif  // !ROCKSDB_LITE
+
+  // Note that here we treat flush as level 0 compaction in internal stats
+  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+  stats.micros = db_options_.env->NowMicros() - start_micros;
+  stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros;
+  stats.bytes_written = meta_.fd.GetFileSize();
+  RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
+  cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+  cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+                                     meta_.fd.GetFileSize());
+  RecordFlushIOStats();
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
+  db_mutex_->AssertHeld();
+  std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
+  info->cf_id = cfd_->GetID();
+  info->cf_name = cfd_->GetName();
+
+  const uint64_t file_number = meta_.fd.GetNumber();
+  info->file_path =
+      MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
+  info->file_number = file_number;
+  info->oldest_blob_file_number = meta_.oldest_blob_file_number;
+  info->thread_id = db_options_.env->GetThreadID();
+  info->job_id = job_context_->job_id;
+  info->smallest_seqno = meta_.fd.smallest_seqno;
+  info->largest_seqno = meta_.fd.largest_seqno;
+  info->table_properties = table_properties_;
+  info->flush_reason = cfd_->GetFlushReason();
+  return info;
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
new file mode 100644
index 000000000..1f4435f4c
--- /dev/null
+++ b/src/rocksdb/db/flush_job.h
@@ -0,0 +1,158 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
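+//
+// Usage sketch (illustrative; FlushJob is internal to RocksDB and the
+// argument list below is abbreviated):
+//
+//     FlushJob job(dbname, cfd, db_options, mutable_cf_options, ...);
+//     db_mutex->Lock();
+//     job.PickMemTable();                // must be called exactly once
+//     Status s = job.Run(&prep_tracker, &file_meta);  // or job.Cancel()
+//     db_mutex->Unlock();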
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class FlushJob {
+ public:
+  // TODO(icanadi) make effort to reduce number of parameters here
+  // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+  FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+           const ImmutableDBOptions& db_options,
+           const MutableCFOptions& mutable_cf_options,
+           const uint64_t* max_memtable_id, const FileOptions& file_options,
+           VersionSet* versions, InstrumentedMutex* db_mutex,
+           std::atomic<bool>* shutting_down,
+           std::vector<SequenceNumber> existing_snapshots,
+           SequenceNumber earliest_write_conflict_snapshot,
+           SnapshotChecker* snapshot_checker, JobContext* job_context,
+           LogBuffer* log_buffer, Directory* db_directory,
+           Directory* output_file_directory, CompressionType output_compression,
+           Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+           const bool sync_output_directory, const bool write_manifest,
+           Env::Priority thread_pri);
+
+  ~FlushJob();
+
+  // Require db_mutex held.
+  // Once PickMemTable() is called, either Run() or Cancel() has to be called.
+  void PickMemTable();
+  Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
+             FileMetaData* file_meta = nullptr);
+  void Cancel();
+  const autovector<MemTable*>& GetMemTables() const { return mems_; }
+
+#ifndef ROCKSDB_LITE
+  std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
+    return &committed_flush_jobs_info_;
+  }
+#endif  // !ROCKSDB_LITE
+
+ private:
+  void ReportStartedFlush();
+  void ReportFlushInputSize(const autovector<MemTable*>& mems);
+  void RecordFlushIOStats();
+  Status WriteLevel0Table();
+#ifndef ROCKSDB_LITE
+  std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
+#endif  // !ROCKSDB_LITE
+
+  const std::string& dbname_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const MutableCFOptions& mutable_cf_options_;
+  // Pointer to a variable storing the largest memtable id to flush in this
+  // flush job. RocksDB uses this variable to select the memtables to flush in
+  // this job. All memtables in this column family with an ID smaller than or
+  // equal to *max_memtable_id_ will be selected for flush. If null, then all
+  // memtables in the column family will be selected.
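+  // For example, if the column family holds immutable memtables with IDs
+  // {3, 4, 5} and *max_memtable_id_ == 4, then memtables 3 and 4 are
+  // selected and memtable 5 is left for a later flush job.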
+  const uint64_t* max_memtable_id_;
+  const FileOptions file_options_;
+  VersionSet* versions_;
+  InstrumentedMutex* db_mutex_;
+  std::atomic<bool>* shutting_down_;
+  std::vector<SequenceNumber> existing_snapshots_;
+  SequenceNumber earliest_write_conflict_snapshot_;
+  SnapshotChecker* snapshot_checker_;
+  JobContext* job_context_;
+  LogBuffer* log_buffer_;
+  Directory* db_directory_;
+  Directory* output_file_directory_;
+  CompressionType output_compression_;
+  Statistics* stats_;
+  EventLogger* event_logger_;
+  TableProperties table_properties_;
+  bool measure_io_stats_;
+  // True if this flush job should call fsync on the output directory. False
+  // otherwise.
+  // Usually sync_output_directory_ is true. A flush job needs to call sync on
+  // the output directory before committing to the MANIFEST.
+  // However, an individual flush job does not have to call sync on the output
+  // directory if it is part of an atomic flush. After all flush jobs in the
+  // atomic flush succeed, call sync once on each distinct output directory.
+  const bool sync_output_directory_;
+  // True if this flush job should write to MANIFEST after successfully
+  // flushing memtables. False otherwise.
+  // Usually write_manifest_ is true. A flush job commits to the MANIFEST after
+  // flushing the memtables.
+  // However, an individual flush job cannot rashly write to the MANIFEST
+  // immediately after it finishes the flush if it is part of an atomic flush.
+  // In this case, only after all flush jobs succeed in flush can RocksDB
+  // commit to the MANIFEST.
+  const bool write_manifest_;
+  // The current flush job can commit flush result of a concurrent flush job.
+  // We collect FlushJobInfo of all jobs committed by current job and fire
+  // OnFlushCompleted for them.
+  std::list<std::unique_ptr<FlushJobInfo>> committed_flush_jobs_info_;
+
+  // Variables below are set by PickMemTable():
+  FileMetaData meta_;
+  autovector<MemTable*> mems_;
+  VersionEdit* edit_;
+  Version* base_;
+  bool pick_memtable_called;
+  Env::Priority thread_pri_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
new file mode 100644
index 000000000..b77a4a2a9
--- /dev/null
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -0,0 +1,498 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <array>
+#include <map>
+#include <string>
+
+#include "db/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/flush_job.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+class FlushJobTest : public testing::Test {
+ public:
+  FlushJobTest()
+      : env_(Env::Default()),
+        fs_(std::make_shared<LegacyFileSystemWrapper>(env_)),
+        dbname_(test::PerThreadDBPath("flush_job_test")),
+        options_(),
+        db_options_(options_),
+        column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_manager_(db_options_.db_write_buffer_size),
+        shutting_down_(false),
+        mock_table_factory_(new mock::MockTableFactory()) {
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    db_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    // TODO(icanadi) Remove this once we mock out VersionSet
+    NewDB();
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    for (const auto& cf_name : column_family_names_) {
+      column_families.emplace_back(cf_name, cf_options_);
+    }
+
+    db_options_.env = env_;
+    db_options_.fs = fs_;
+    versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
+                                   table_cache_.get(), &write_buffer_manager_,
+                                   &write_controller_,
+                                   /*block_cache_tracer=*/nullptr));
+    EXPECT_OK(versions_->Recover(column_families, false));
+  }
+
+  void NewDB() {
+    SetIdentityFile(env_, dbname_);
+    VersionEdit new_db;
+    if (db_options_.write_dbid_to_manifest) {
+      DBImpl* impl = new DBImpl(DBOptions(), dbname_);
+      std::string db_id;
+      impl->GetDbIdentityFromIdentityFile(&db_id);
+      new_db.SetDBId(db_id);
+    }
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    autovector<VersionEdit> new_cfs;
+    SequenceNumber last_seq = 1;
+    uint32_t cf_id = 1;
+    for (size_t i = 1; i != column_family_names_.size(); ++i) {
+      VersionEdit new_cf;
+      new_cf.AddColumnFamily(column_family_names_[i]);
+      new_cf.SetColumnFamily(cf_id++);
+      new_cf.SetLogNumber(0);
+      new_cf.SetNextFile(2);
+      new_cf.SetLastSequence(last_seq++);
+      new_cfs.emplace_back(new_cf);
+    }
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    std::unique_ptr<WritableFile> file;
+    Status s = env_->NewWritableFile(
+        manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+    ASSERT_OK(s);
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        NewLegacyWritableFileWrapper(std::move(file)), manifest, EnvOptions()));
+    {
+      log::Writer log(std::move(file_writer), 0, false);
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+
+      for (const auto& e : new_cfs) {
+        record.clear();
+        e.EncodeTo(&record);
+        s = log.AddRecord(record);
+        ASSERT_OK(s);
+      }
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1, nullptr);
+  }
+
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string dbname_;
+  EnvOptions env_options_;
+  Options options_;
+  ImmutableDBOptions db_options_;
+  const std::vector<std::string> column_family_names_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  WriteBufferManager write_buffer_manager_;
+  ColumnFamilyOptions cf_options_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+TEST_F(FlushJobTest, Empty) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     nullptr /* memtable_id */, env_options_, versions_.get(),
+                     &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+                     snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, nullptr, &event_logger, false,
+                     true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    flush_job.PickMemTable();
+    ASSERT_OK(flush_job.Run());
+  }
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, NonEmpty) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                           kMaxSequenceNumber);
+  new_mem->Ref();
+  auto inserted_keys = mock::MakeMockFile();
+  // Test data:
+  //   seqno [    1,    2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
+  //   key   [ 1001, 1002 ... 9998, 9999,    0,    1,    2 ...  999 ]
+  //   range-delete "9995" -> "9999" at seqno 10000
+  //   blob references with seqnos 10001..10006
+  for (int i = 1; i < 10000; ++i) {
+    std::string key(ToString((i + 1000) % 10000));
+    std::string value("value" + key);
+    new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
+    if ((i + 1000) % 10000 < 9995) {
+      InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+      inserted_keys.insert({internal_key.Encode().ToString(), value});
+    }
+  }
+
+  {
+    new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a");
+    InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
+    inserted_keys.insert({internal_key.Encode().ToString(), "9999a"});
+  }
+
+#ifndef ROCKSDB_LITE
+  // Note: the first two blob references will not be considered when resolving
+  // the oldest blob file referenced (the first one is inlined TTL, while the
+  // second one is TTL and thus points to a TTL blob file).
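+  // Of the six references added below, only the non-TTL blob references
+  // (file numbers 103, 17, 102 and 101) participate in that resolution, so
+  // the oldest referenced blob file is expected to be 17; this is asserted
+  // against file_meta.oldest_blob_file_number further down.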
+  constexpr std::array<uint64_t, 6> blob_file_numbers{
+      kInvalidBlobFileNumber, 5, 103, 17, 102, 101};
+  for (size_t i = 0; i < blob_file_numbers.size(); ++i) {
+    std::string key(ToString(i + 10001));
+    std::string blob_index;
+    if (i == 0) {
+      BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL,
+                                  "foo");
+    } else if (i == 1) {
+      BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL,
+                               blob_file_numbers[i], /* offset */ i << 10,
+                               /* size */ i << 20, kNoCompression);
+    } else {
+      BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i],
+                            /* offset */ i << 10, /* size */ i << 20,
+                            kNoCompression);
+    }
+
+    const SequenceNumber seq(i + 10001);
+    new_mem->Add(seq, kTypeBlobIndex, key, blob_index);
+
+    InternalKey internal_key(key, seq, kTypeBlobIndex);
+    inserted_keys.emplace_hint(inserted_keys.end(),
+                               internal_key.Encode().ToString(), blob_index);
+  }
+#endif
+
+  autovector<MemTable*> to_delete;
+  cfd->imm()->Add(new_mem, &to_delete);
+  for (auto& m : to_delete) {
+    delete m;
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     nullptr /* memtable_id */, env_options_, versions_.get(),
+                     &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+                     snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER);
+
+  HistogramData hist;
+  FileMetaData file_meta;
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run(nullptr, &file_meta));
+  mutex_.Unlock();
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+
+  ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+  ASSERT_EQ("9999a", file_meta.largest.user_key().ToString());
+  ASSERT_EQ(1, file_meta.fd.smallest_seqno);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(10006, file_meta.fd.largest_seqno);
+  ASSERT_EQ(17, file_meta.oldest_blob_file_number);
+#else
+  ASSERT_EQ(10000, file_meta.fd.largest_seqno);
+#endif
+  mock_table_factory_->AssertSingleFile(inserted_keys);
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
+  const size_t num_mems = 2;
+  const size_t num_mems_to_flush = 1;
+  const size_t num_keys_per_table = 100;
+  JobContext job_context(0);
+  ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+  std::vector<uint64_t> memtable_ids;
+  std::vector<MemTable*> new_mems;
+  for (size_t i = 0; i != num_mems; ++i) {
+    MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                              kMaxSequenceNumber);
+    mem->SetID(i);
+    mem->Ref();
+    new_mems.emplace_back(mem);
+    memtable_ids.push_back(mem->GetID());
+
+    for (size_t j = 0; j < num_keys_per_table; ++j) {
+      std::string key(ToString(j + i * num_keys_per_table));
+      std::string value("value" + key);
+      mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key,
+               value);
+    }
+  }
+
+  autovector<MemTable*> to_delete;
+  for (auto mem : new_mems) {
+    cfd->imm()->Add(mem, &to_delete);
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+
+  assert(memtable_ids.size() == num_mems);
+  uint64_t smallest_memtable_id = memtable_ids.front();
+  uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     &flush_memtable_id, env_options_, versions_.get(), &mutex_,
+                     &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker,
+                     &job_context, nullptr, nullptr, nullptr, kNoCompression,
+                     db_options_.statistics.get(), &event_logger, true,
+                     true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER);
+  HistogramData hist;
+  FileMetaData file_meta;
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta));
+  mutex_.Unlock();
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+
+  ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+  ASSERT_EQ("99", file_meta.largest.user_key().ToString());
+  ASSERT_EQ(0, file_meta.fd.smallest_seqno);
+  ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1),
+            file_meta.fd.largest_seqno);
+  ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number);
+
+  for (auto m : to_delete) {
+    delete m;
+  }
+  to_delete.clear();
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
+  autovector<ColumnFamilyData*> all_cfds;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    all_cfds.push_back(cfd);
+  }
+  const std::vector<size_t> num_memtables = {2, 1, 3};
+  assert(num_memtables.size() == column_family_names_.size());
+  const size_t num_keys_per_memtable = 1000;
+  JobContext job_context(0);
+  std::vector<uint64_t> memtable_ids;
+  std::vector<SequenceNumber> smallest_seqs;
+  std::vector<SequenceNumber> largest_seqs;
+  autovector<MemTable*> to_delete;
+  SequenceNumber curr_seqno = 0;
+  size_t k = 0;
+  for (auto cfd : all_cfds) {
+    smallest_seqs.push_back(curr_seqno);
+    for (size_t i = 0; i != num_memtables[k]; ++i) {
+      MemTable* mem = cfd->ConstructNewMemtable(
+          *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+      mem->SetID(i);
+      mem->Ref();
+
+      for (size_t j = 0; j != num_keys_per_memtable; ++j) {
+        std::string key(ToString(j + i * num_keys_per_memtable));
+        std::string value("value" + key);
+        mem->Add(curr_seqno++, kTypeValue, key, value);
+      }
+
+      cfd->imm()->Add(mem, &to_delete);
+    }
+    largest_seqs.push_back(curr_seqno - 1);
+    memtable_ids.push_back(num_memtables[k++] - 1);
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+  std::vector<std::unique_ptr<FlushJob>> flush_jobs;
+  k = 0;
+  for (auto cfd : all_cfds) {
+    std::vector<SequenceNumber> snapshot_seqs;
+    flush_jobs.emplace_back(new FlushJob(
+        dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+        &memtable_ids[k], env_options_, versions_.get(), &mutex_,
+        &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
+        &job_context, nullptr, nullptr, nullptr, kNoCompression,
+        db_options_.statistics.get(), &event_logger, true,
+        false /* sync_output_directory */, false /* write_manifest */,
+        Env::Priority::USER));
+    k++;
+  }
+  HistogramData hist;
+  std::vector<FileMetaData> file_metas;
+  // Call reserve to avoid auto-resizing
+  file_metas.reserve(flush_jobs.size());
+  mutex_.Lock();
+  for (auto& job : flush_jobs) {
+    job->PickMemTable();
+  }
+  for (auto& job : flush_jobs) {
+    FileMetaData meta;
+    // Run will release and re-acquire mutex
+    ASSERT_OK(job->Run(nullptr /**/, &meta));
+    file_metas.emplace_back(meta);
+  }
+  autovector<FileMetaData*> file_meta_ptrs;
+  for (auto& meta : file_metas) {
+    file_meta_ptrs.push_back(&meta);
+  }
+  autovector<const autovector<MemTable*>*> mems_list;
+  for (size_t i = 0; i != all_cfds.size(); ++i) {
+    const auto& mems = flush_jobs[i]->GetMemTables();
+    mems_list.push_back(&mems);
+  }
+  autovector<const MutableCFOptions*> mutable_cf_options_list;
+  for (auto cfd : all_cfds) {
+    mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+  }
+
+  Status s = InstallMemtableAtomicFlushResults(
+      nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list,
+      versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free,
+      nullptr /* db_directory */, nullptr /* log_buffer */);
+  ASSERT_OK(s);
+
+  mutex_.Unlock();
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+  k = 0;
+  for (const auto& file_meta : file_metas) {
+    ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
+    ASSERT_EQ("999", file_meta.largest.user_key()
+                         .ToString());  // max key by bytewise comparator
+    ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno);
+    ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno);
+    // Verify that imm is empty
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
+              all_cfds[k]->imm()->GetEarliestMemTableID());
+    ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID());
+    ++k;
+  }
+
+  for (auto m : to_delete) {
+    delete m;
+  }
+  to_delete.clear();
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, Snapshots) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                           kMaxSequenceNumber);
+
+  std::set<uint64_t> snapshots_set;
+  int keys = 10000;
+  int max_inserts_per_keys = 8;
+
+  Random rnd(301);
+  for (int i = 0; i < keys / 2; ++i) {
+    snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1);
+  }
+  // set has already removed the duplicate snapshots
+  std::vector<SequenceNumber> snapshots(snapshots_set.begin(),
+                                        snapshots_set.end());
+
+  new_mem->Ref();
+  SequenceNumber current_seqno = 0;
+  auto inserted_keys = mock::MakeMockFile();
+  for (int i = 1; i < keys; ++i) {
+    std::string key(ToString(i));
+    int insertions = rnd.Uniform(max_inserts_per_keys);
+    for (int j = 0; j < insertions; ++j) {
+      std::string value(test::RandomHumanReadableString(&rnd, 10));
+      auto seqno = ++current_seqno;
+      new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value);
+      // a key is visible only if:
+      // 1. it's the last one written (j == insertions - 1)
+      // 2. there's a snapshot pointing at it
+      bool visible = (j == insertions - 1) ||
+                     (snapshots_set.find(seqno) != snapshots_set.end());
+      if (visible) {
+        InternalKey internal_key(key, seqno, kTypeValue);
+        inserted_keys.insert({internal_key.Encode().ToString(), value});
+      }
+    }
+  }
+
+  autovector<MemTable*> to_delete;
+  cfd->imm()->Add(new_mem, &to_delete);
+  for (auto& m : to_delete) {
+    delete m;
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     nullptr /* memtable_id */, env_options_, versions_.get(),
+                     &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+                     snapshot_checker, &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER);
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run());
+  mutex_.Unlock();
+  mock_table_factory_->AssertSingleFile(inserted_keys);
+  HistogramData hist;
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+  job_context.Clean();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc
new file mode 100644
index 000000000..6f4d3e1a5
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+  {
+    std::lock_guard<std::mutex> lock(checking_mutex_);
+    assert(checking_set_.count(cfd) == 0);
+    checking_set_.insert(cfd);
+  }
+#endif  // NDEBUG
+  cfd->Ref();
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+  Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
+  while (!head_.compare_exchange_strong(
+      node->next, node, std::memory_order_relaxed,
+      std::memory_order_relaxed)) {
+    // failing CAS updates the first param, so we are already set for
+    // retry. TakeNextColumnFamily won't happen until after another
+    // inter-thread synchronization, so we don't even need release
+    // semantics for this CAS
+  }
+#endif  // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+  while (true) {
+    if (head_.load(std::memory_order_relaxed) == nullptr) {
+      return nullptr;
+    }
+
+    // dequeue the head
+    Node* node = head_.load(std::memory_order_relaxed);
+    head_.store(node->next, std::memory_order_relaxed);
+    ColumnFamilyData* cfd = node->column_family;
+    delete node;
+
+#ifndef NDEBUG
+    {
+      std::lock_guard<std::mutex> lock(checking_mutex_);
+      auto iter = checking_set_.find(cfd);
+      assert(iter != checking_set_.end());
+      checking_set_.erase(iter);
+    }
+#endif  // NDEBUG
+
+    if (!cfd->IsDropped()) {
+      // success
+      return cfd;
+    }
+
+    // no longer relevant, retry
+    cfd->UnrefAndTryDelete();
+  }
+}
+
+bool FlushScheduler::Empty() {
+  auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  // Empty is allowed to be called concurrently with ScheduleWork. It would
+  // only miss the recent schedules.
+  assert((rv == checking_set_.empty()) || rv);
+#endif  // NDEBUG
+  return rv;
+}
+
+void FlushScheduler::Clear() {
+  ColumnFamilyData* cfd;
+  while ((cfd = TakeNextColumnFamily()) != nullptr) {
+    cfd->UnrefAndTryDelete();
+  }
+  assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 000000000..cbe17994f
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <mutex>
+#include <set>
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtable may
+// be full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from
+// a single-threaded recovery context.
+class FlushScheduler {
+ public:
+  FlushScheduler() : head_(nullptr) {}
+
+  // May be called from multiple threads at once, but not concurrent with
+  // any other method calls on this instance
+  void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Removes and returns Ref()-ed column family. Client needs to Unref().
+  // Filters column families that have been dropped.
+  ColumnFamilyData* TakeNextColumnFamily();
+
+  // This can be called concurrently with ScheduleWork but it would miss all
+  // the scheduled flushes after the last synchronization. This would result
+  // in less precise enforcement of memtable sizes but should not matter much.
+  bool Empty();
+
+  void Clear();
+
+ private:
+  struct Node {
+    ColumnFamilyData* column_family;
+    Node* next;
+  };
+
+  std::atomic<Node*> head_;
+#ifndef NDEBUG
+  std::mutex checking_mutex_;
+  std::set<ColumnFamilyData*> checking_set_;
+#endif  // NDEBUG
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 000000000..f2b882549
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,975 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Usage:
+//     ForwardLevelIterator iter;
+//     iter.SetFileIndex(file_index);
+//     iter.Seek(target); // or iter.SeekToFirst();
+//     iter.Next()
+class ForwardLevelIterator : public InternalIterator {
+ public:
+  ForwardLevelIterator(const ColumnFamilyData* const cfd,
+                       const ReadOptions& read_options,
+                       const std::vector<FileMetaData*>& files,
+                       const SliceTransform* prefix_extractor)
+      : cfd_(cfd),
+        read_options_(read_options),
+        files_(files),
+        valid_(false),
+        file_index_(std::numeric_limits<uint32_t>::max()),
+        file_iter_(nullptr),
+        pinned_iters_mgr_(nullptr),
+        prefix_extractor_(prefix_extractor) {}
+
+  ~ForwardLevelIterator() override {
+    // Reset current pointer
+    if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+      pinned_iters_mgr_->PinIterator(file_iter_);
+    } else {
+      delete file_iter_;
+    }
+  }
+
+  void SetFileIndex(uint32_t file_index) {
+    assert(file_index < files_.size());
+    status_ = Status::OK();
+    if (file_index != file_index_) {
+      file_index_ = file_index;
+      Reset();
+    }
+  }
+  void Reset() {
+    assert(file_index_ < files_.size());
+
+    // Reset current pointer
+    if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+      pinned_iters_mgr_->PinIterator(file_iter_);
+    } else {
+      delete file_iter_;
+    }
+
+    ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+                                         kMaxSequenceNumber /* upper_bound */);
+    file_iter_ = cfd_->table_cache()->NewIterator(
+        read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+        *files_[file_index_],
+        read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+        prefix_extractor_, /*table_reader_ptr=*/nullptr,
+        /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator,
+        /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
+        /*smallest_compaction_key=*/nullptr,
+        /*largest_compaction_key=*/nullptr);
+    file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+    valid_ = false;
+    if (!range_del_agg.IsEmpty()) {
+      status_ = Status::NotSupported(
+          "Range tombstones unsupported with ForwardIterator");
+    }
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("ForwardLevelIterator::Prev()");
+    valid_ = false;
+  }
+  bool Valid() const override {
+    return valid_;
+  }
+  void SeekToFirst() override {
+    assert(file_iter_ != nullptr);
+    if (!status_.ok()) {
+      assert(!valid_);
+      return;
+    }
+    file_iter_->SeekToFirst();
+    valid_ = file_iter_->Valid();
+  }
+  void Seek(const Slice& internal_key) override {
+    assert(file_iter_ != nullptr);
+
+    // This deviates from the usual convention for InternalIterator::Seek() in
+    // that it doesn't discard pre-existing error status. That's because this
+    // Seek() is only supposed to be called immediately after SetFileIndex()
+    // (which discards pre-existing error status), and SetFileIndex() may set
+    // an error status, which we shouldn't discard.
+    if (!status_.ok()) {
+      assert(!valid_);
+      return;
+    }
+
+    file_iter_->Seek(internal_key);
+    valid_ = file_iter_->Valid();
+  }
+  void SeekForPrev(const Slice& /*internal_key*/) override {
+    status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()");
+    valid_ = false;
+  }
+  void Next() override {
+    assert(valid_);
+    file_iter_->Next();
+    for (;;) {
+      valid_ = file_iter_->Valid();
+      if (!file_iter_->status().ok()) {
+        assert(!valid_);
+        return;
+      }
+      if (valid_) {
+        return;
+      }
+      if (file_index_ + 1 >= files_.size()) {
+        valid_ = false;
+        return;
+      }
+      SetFileIndex(file_index_ + 1);
+      if (!status_.ok()) {
+        assert(!valid_);
+        return;
+      }
+      file_iter_->SeekToFirst();
+    }
+  }
+  Slice key() const override {
+    assert(valid_);
+    return file_iter_->key();
+  }
+  Slice value() const override {
+    assert(valid_);
+    return file_iter_->value();
+  }
+  Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    } else if (file_iter_) {
+      return file_iter_->status();
+    }
+    return Status::OK();
+  }
+  bool IsKeyPinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_->IsKeyPinned();
+  }
+  bool IsValuePinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_->IsValuePinned();
+  }
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    if (file_iter_) {
+      file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+
+ private:
+  const ColumnFamilyData* const cfd_;
+  const ReadOptions& read_options_;
+  const std::vector<FileMetaData*>& files_;
+
+  bool valid_;
+  uint32_t file_index_;
+  Status status_;
+  InternalIterator* file_iter_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  const SliceTransform* prefix_extractor_;
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                                 ColumnFamilyData* cfd,
+                                 SuperVersion* current_sv)
+    : db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()),
+      user_comparator_(cfd->user_comparator()),
+      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+      sv_(current_sv),
+      mutable_iter_(nullptr),
+      current_(nullptr),
+      valid_(false),
+      status_(Status::OK()),
+      immutable_status_(Status::OK()),
+      has_iter_trimmed_for_upper_bound_(false),
+      current_over_upper_bound_(false),
+      is_prev_set_(false),
+      is_prev_inclusive_(false),
+      pinned_iters_mgr_(nullptr) {
+  if (sv_) {
+    RebuildIterators(false);
+  }
+}
+
+ForwardIterator::~ForwardIterator() {
+  Cleanup(true);
+}
+
+void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv,
+                                bool background_purge_on_iterator_cleanup) {
+  if (sv->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    JobContext job_context(0);
+    db->mutex_.Lock();
+    sv->Cleanup();
+    db->FindObsoleteFiles(&job_context, false, true);
+    if (background_purge_on_iterator_cleanup) {
+      db->ScheduleBgLogWriterClose(&job_context);
+      db->AddSuperVersionsToFreeQueue(sv);
+      db->SchedulePurge();
+    }
+    db->mutex_.Unlock();
+    if (!background_purge_on_iterator_cleanup) {
+      delete sv;
+    }
+    if (job_context.HaveSomethingToDelete()) {
+      db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup);
+    }
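+    // By this point the files found obsolete by FindObsoleteFiles() have
+    // either been purged inline above or handed off to the background purge
+    // queue, so the job context can be cleaned up.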
+    job_context.Clean();
+  }
+}
+
+namespace {
+struct SVCleanupParams {
+  DBImpl* db;
+  SuperVersion* sv;
+  bool background_purge_on_iterator_cleanup;
+};
+}
+
+// Used in PinnedIteratorsManager to release pinned SuperVersion
+void ForwardIterator::DeferredSVCleanup(void* arg) {
+  auto d = reinterpret_cast<SVCleanupParams*>(arg);
+  ForwardIterator::SVCleanup(
+      d->db, d->sv, d->background_purge_on_iterator_cleanup);
+  delete d;
+}
+
+void ForwardIterator::SVCleanup() {
+  if (sv_ == nullptr) {
+    return;
+  }
+  bool background_purge =
+      read_options_.background_purge_on_iterator_cleanup ||
+      db_->immutable_db_options().avoid_unnecessary_blocking_io;
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    // pinned_iters_mgr_ tells us to make sure that all visited key-value
+    // slices are alive until pinned_iters_mgr_->ReleasePinnedData() is called.
+    // The slices may point into some memtables owned by sv_, so we need to
+    // keep sv_ referenced until pinned_iters_mgr_ unpins everything.
+    auto p = new SVCleanupParams{db_, sv_, background_purge};
+    pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup);
+  } else {
+    SVCleanup(db_, sv_, background_purge);
+  }
+}
+
+void ForwardIterator::Cleanup(bool release_sv) {
+  if (mutable_iter_ != nullptr) {
+    DeleteIterator(mutable_iter_, true /* is_arena */);
+  }
+
+  for (auto* m : imm_iters_) {
+    DeleteIterator(m, true /* is_arena */);
+  }
+  imm_iters_.clear();
+
+  for (auto* f : l0_iters_) {
+    DeleteIterator(f);
+  }
+  l0_iters_.clear();
+
+  for (auto* l : level_iters_) {
+    DeleteIterator(l);
+  }
+  level_iters_.clear();
+
+  if (release_sv) {
+    SVCleanup();
+  }
+}
+
+bool ForwardIterator::Valid() const {
+  // See UpdateCurrent().
+  return valid_ ? !current_over_upper_bound_ : false;
+}
+
+void ForwardIterator::SeekToFirst() {
+  if (sv_ == nullptr) {
+    RebuildIterators(true);
+  } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RenewIterators();
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+  SeekInternal(Slice(), true);
+}
+
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+  return !(read_options_.iterate_upper_bound == nullptr ||
+           cfd_->internal_comparator().user_comparator()->Compare(
+               ExtractUserKey(internal_key),
+               *read_options_.iterate_upper_bound) < 0);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+  if (sv_ == nullptr) {
+    RebuildIterators(true);
+  } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RenewIterators();
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+  SeekInternal(internal_key, false);
+}
+
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+                                   bool seek_to_first) {
+  assert(mutable_iter_);
+  // mutable
+  seek_to_first ? mutable_iter_->SeekToFirst() :
+                  mutable_iter_->Seek(internal_key);
+
+  // immutable
+  // TODO(ljin): NeedToSeekImmutable has negative impact on performance
+  // if it turns to need to seek immutable often. We probably want to have
+  // an option to turn it off.
+  if (seek_to_first || NeedToSeekImmutable(internal_key)) {
+    immutable_status_ = Status::OK();
+    if (has_iter_trimmed_for_upper_bound_ &&
+        (
+            // prev_ is not set yet
+            is_prev_set_ == false ||
+            // We are doing SeekToFirst() and internal_key.size() = 0
+            seek_to_first ||
+            // prev_key_ > internal_key
+            cfd_->internal_comparator().InternalKeyComparator::Compare(
+                prev_key_.GetInternalKey(), internal_key) > 0)) {
+      // Some iterators are trimmed. Need to rebuild.
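+      // A trimmed child iterator (deleted and nulled out because it could
+      // only produce keys at or beyond iterate_upper_bound) may be needed
+      // again when the seek target moves backwards, so the whole set of
+      // child iterators is rebuilt before seeking.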
+      RebuildIterators(true);
+      // Already seeked mutable iter, so seek again
+      seek_to_first ? mutable_iter_->SeekToFirst()
+                    : mutable_iter_->Seek(internal_key);
+    }
+    {
+      auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+      immutable_min_heap_.swap(tmp);
+    }
+    for (size_t i = 0; i < imm_iters_.size(); i++) {
+      auto* m = imm_iters_[i];
+      seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+      if (!m->status().ok()) {
+        immutable_status_ = m->status();
+      } else if (m->Valid()) {
+        immutable_min_heap_.push(m);
+      }
+    }
+
+    Slice target_user_key;
+    if (!seek_to_first) {
+      target_user_key = ExtractUserKey(internal_key);
+    }
+    const VersionStorageInfo* vstorage = sv_->current->storage_info();
+    const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+    for (size_t i = 0; i < l0.size(); ++i) {
+      if (!l0_iters_[i]) {
+        continue;
+      }
+      if (seek_to_first) {
+        l0_iters_[i]->SeekToFirst();
+      } else {
+        // If the target key passes over the largest key, we are sure Next()
+        // won't go over this file.
+        if (user_comparator_->Compare(target_user_key,
+                                      l0[i]->largest.user_key()) > 0) {
+          if (read_options_.iterate_upper_bound != nullptr) {
+            has_iter_trimmed_for_upper_bound_ = true;
+            DeleteIterator(l0_iters_[i]);
+            l0_iters_[i] = nullptr;
+          }
+          continue;
+        }
+        l0_iters_[i]->Seek(internal_key);
+      }
+
+      if (!l0_iters_[i]->status().ok()) {
+        immutable_status_ = l0_iters_[i]->status();
+      } else if (l0_iters_[i]->Valid() &&
+                 !IsOverUpperBound(l0_iters_[i]->key())) {
+        immutable_min_heap_.push(l0_iters_[i]);
+      } else {
+        has_iter_trimmed_for_upper_bound_ = true;
+        DeleteIterator(l0_iters_[i]);
+        l0_iters_[i] = nullptr;
+      }
+    }
+
+    for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+      const std::vector<FileMetaData*>& level_files =
+          vstorage->LevelFiles(level);
+      if (level_files.empty()) {
+        continue;
+      }
+      if (level_iters_[level - 1] == nullptr) {
+        continue;
+      }
+      uint32_t f_idx = 0;
+      if (!seek_to_first) {
+        f_idx = FindFileInRange(level_files, internal_key, 0,
+                                static_cast<uint32_t>(level_files.size()));
+      }
+
+      // Seek
+      if (f_idx < level_files.size()) {
+        level_iters_[level - 1]->SetFileIndex(f_idx);
+        seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
+                        level_iters_[level - 1]->Seek(internal_key);
+
+        if (!level_iters_[level - 1]->status().ok()) {
+          immutable_status_ = level_iters_[level - 1]->status();
+        } else if (level_iters_[level - 1]->Valid() &&
+                   !IsOverUpperBound(level_iters_[level - 1]->key())) {
+          immutable_min_heap_.push(level_iters_[level - 1]);
+        } else {
+          // Nothing in this level is interesting. Remove.
+          has_iter_trimmed_for_upper_bound_ = true;
+          DeleteIterator(level_iters_[level - 1]);
+          level_iters_[level - 1] = nullptr;
+        }
+      }
+    }
+
+    if (seek_to_first) {
+      is_prev_set_ = false;
+    } else {
+      prev_key_.SetInternalKey(internal_key);
+      is_prev_set_ = true;
+      is_prev_inclusive_ = true;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+  } else if (current_ && current_ != mutable_iter_) {
+    // current_ is one of immutable iterators, push it back to the heap
+    immutable_min_heap_.push(current_);
+  }
+
+  UpdateCurrent();
+  TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this);
+}
+
+void ForwardIterator::Next() {
+  assert(valid_);
+  bool update_prev_key = false;
+
+  if (sv_ == nullptr ||
+      sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    std::string current_key = key().ToString();
+    Slice old_key(current_key.data(), current_key.size());
+
+    if (sv_ == nullptr) {
+      RebuildIterators(true);
+    } else {
+      RenewIterators();
+    }
+    SeekInternal(old_key, false);
+    if (!valid_ || key().compare(old_key) != 0) {
+      return;
+    }
+  } else if (current_ != mutable_iter_) {
+    // It is going to advance immutable iterator
+
+    if (is_prev_set_ && prefix_extractor_) {
+      // advance prev_key_ to current_ only if they share the same prefix
+      update_prev_key =
+          prefix_extractor_->Transform(prev_key_.GetUserKey())
+              .compare(prefix_extractor_->Transform(current_->key())) == 0;
+    } else {
+      update_prev_key = true;
+    }
+
+    if (update_prev_key) {
+      prev_key_.SetInternalKey(current_->key());
+      is_prev_set_ = true;
+      is_prev_inclusive_ = false;
+    }
+  }
+
+  current_->Next();
+  if (current_ != mutable_iter_) {
+    if (!current_->status().ok()) {
+      immutable_status_ = current_->status();
+    } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) {
+      immutable_min_heap_.push(current_);
+    } else {
+      if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) {
+        // remove the current iterator
+        DeleteCurrentIter();
+        current_ = nullptr;
+      }
+      if (update_prev_key) {
+        mutable_iter_->Seek(prev_key_.GetInternalKey());
+      }
+    }
+  }
+  UpdateCurrent();
+  TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this);
+}
+
+Slice ForwardIterator::key() const {
+  assert(valid_);
+  return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+  assert(valid_);
+  return current_->value();
+}
+
+Status ForwardIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  } else if (!mutable_iter_->status().ok()) {
+    return mutable_iter_->status();
+  }
+
+  return immutable_status_;
+}
+
+Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) {
+  assert(prop != nullptr);
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    *prop = ToString(sv_->version_number);
+    return Status::OK();
+  }
+  return Status::InvalidArgument();
+}
+
+void ForwardIterator::SetPinnedItersMgr(
+    PinnedIteratorsManager* pinned_iters_mgr) {
+  pinned_iters_mgr_ = pinned_iters_mgr;
+  UpdateChildrenPinnedItersMgr();
+}
+
+void ForwardIterator::UpdateChildrenPinnedItersMgr() {
+  // Set PinnedIteratorsManager for mutable memtable iterator.
+  if (mutable_iter_) {
+    mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+  }
+
+  // Set PinnedIteratorsManager for immutable memtable iterators.
+  for (InternalIterator* child_iter : imm_iters_) {
+    if (child_iter) {
+      child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+
+  // Set PinnedIteratorsManager for L0 files iterators.
+ for (InternalIterator* child_iter : l0_iters_) { + if (child_iter) { + child_iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + } + + // Set PinnedIteratorsManager for L1+ levels iterators. + for (ForwardLevelIterator* child_iter : level_iters_) { + if (child_iter) { + child_iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + } +} + +bool ForwardIterator::IsKeyPinned() const { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsKeyPinned(); +} + +bool ForwardIterator::IsValuePinned() const { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsValuePinned(); +} + +void ForwardIterator::RebuildIterators(bool refresh_sv) { + // Clean up + Cleanup(refresh_sv); + if (refresh_sv) { + // New + sv_ = cfd_->GetReferencedSuperVersion(db_); + } + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); + mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); + sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); + if (!read_options_.ignore_range_deletions) { + std::unique_ptr range_del_iter( + sv_->mem->NewRangeTombstoneIterator( + read_options_, sv_->current->version_set()->LastSequence())); + range_del_agg.AddTombstones(std::move(range_del_iter)); + sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, + &range_del_agg); + } + has_iter_trimmed_for_upper_bound_ = false; + + const auto* vstorage = sv_->current->storage_info(); + const auto& l0_files = vstorage->LevelFiles(0); + l0_iters_.reserve(l0_files.size()); + for (const auto* l0 : l0_files) { + if ((read_options_.iterate_upper_bound != nullptr) && + cfd_->internal_comparator().user_comparator()->Compare( + l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) { + // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator + // will never be interested in files with smallest key above + // iterate_upper_bound, since iterate_upper_bound can't be changed. + l0_iters_.push_back(nullptr); + continue; + } + l0_iters_.push_back(cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, + read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); + } + BuildLevelIterators(vstorage); + current_ = nullptr; + is_prev_set_ = false; + + UpdateChildrenPinnedItersMgr(); + if (!range_del_agg.IsEmpty()) { + status_ = Status::NotSupported( + "Range tombstones unsupported with ForwardIterator"); + valid_ = false; + } +} + +void ForwardIterator::RenewIterators() { + SuperVersion* svnew; + assert(sv_); + svnew = cfd_->GetReferencedSuperVersion(db_); + + if (mutable_iter_ != nullptr) { + DeleteIterator(mutable_iter_, true /* is_arena */); + } + for (auto* m : imm_iters_) { + DeleteIterator(m, true /* is_arena */); + } + imm_iters_.clear(); + + mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); + svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); + if (!read_options_.ignore_range_deletions) { + std::unique_ptr range_del_iter( + svnew->mem->NewRangeTombstoneIterator( + read_options_, sv_->current->version_set()->LastSequence())); + range_del_agg.AddTombstones(std::move(range_del_iter)); + svnew->imm->AddRangeTombstoneIterators(read_options_, &arena_, + &range_del_agg); + } + + const auto* vstorage = sv_->current->storage_info(); + const auto& l0_files = vstorage->LevelFiles(0); + const auto* vstorage_new = svnew->current->storage_info(); + const auto& l0_files_new = vstorage_new->LevelFiles(0); + size_t iold, inew; + bool found; + std::vector l0_iters_new; + l0_iters_new.reserve(l0_files_new.size()); + + for (inew = 0; inew < l0_files_new.size(); inew++) { + found = false; + for (iold = 0; iold < l0_files.size(); iold++) { + if (l0_files[iold] == l0_files_new[inew]) { + found = true; + break; + } + } + if (found) { + if (l0_iters_[iold] == nullptr) { + l0_iters_new.push_back(nullptr); + TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this); + } else { + l0_iters_new.push_back(l0_iters_[iold]); + l0_iters_[iold] = nullptr; + TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this); + } + continue; + } + l0_iters_new.push_back(cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), + *l0_files_new[inew], + read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, + svnew->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); + } + + for (auto* f : l0_iters_) { + DeleteIterator(f); + } + l0_iters_.clear(); + l0_iters_ = l0_iters_new; + + for (auto* l : level_iters_) { + DeleteIterator(l); + } + level_iters_.clear(); + BuildLevelIterators(vstorage_new); + current_ = nullptr; + is_prev_set_ = false; + SVCleanup(); + sv_ = svnew; + + UpdateChildrenPinnedItersMgr(); + if (!range_del_agg.IsEmpty()) { + status_ = Status::NotSupported( + "Range tombstones unsupported with ForwardIterator"); + valid_ = false; + } +} + +void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage) { + level_iters_.reserve(vstorage->num_levels() - 1); + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { + const auto& level_files = vstorage->LevelFiles(level); + if ((level_files.empty()) || + ((read_options_.iterate_upper_bound != nullptr) && + (user_comparator_->Compare(*read_options_.iterate_upper_bound, + level_files[0]->smallest.user_key()) < + 0))) { + level_iters_.push_back(nullptr); + if (!level_files.empty()) { + has_iter_trimmed_for_upper_bound_ = true; + } + } else { + level_iters_.push_back(new ForwardLevelIterator( + cfd_, read_options_, level_files, + sv_->mutable_cf_options.prefix_extractor.get())); + } + } +} + +void ForwardIterator::ResetIncompleteIterators() { + const auto& l0_files = sv_->current->storage_info()->LevelFiles(0); + for (size_t i = 0; i < l0_iters_.size(); ++i) { + assert(i < l0_files.size()); + if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) { + continue; + } + DeleteIterator(l0_iters_[i]); + l0_iters_[i] = cfd_->table_cache()->NewIterator( + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), + *l0_files[i], /*range_del_agg=*/nullptr, + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); + l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); + } + + for (auto* level_iter : level_iters_) { + if (level_iter && level_iter->status().IsIncomplete()) { + level_iter->Reset(); + } + } + + current_ = nullptr; + is_prev_set_ = false; +} + +void ForwardIterator::UpdateCurrent() { + if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) { + current_ = nullptr; + } else if (immutable_min_heap_.empty()) { + current_ = mutable_iter_; + } else if (!mutable_iter_->Valid()) { + current_ = immutable_min_heap_.top(); + immutable_min_heap_.pop(); + } else { + current_ = immutable_min_heap_.top(); + assert(current_ != nullptr); + assert(current_->Valid()); + int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare( + mutable_iter_->key(), current_->key()); + assert(cmp != 0); + if (cmp > 0) { + immutable_min_heap_.pop(); + } else { + current_ = mutable_iter_; + } + } + valid_ = current_ != nullptr && immutable_status_.ok(); + if (!status_.ok()) { + status_ = Status::OK(); + } + + // Upper bound doesn't apply to the memtable iterator. 
We want Valid() to
+  // return false when all iterators are over iterate_upper_bound, but can't
+  // just set valid_ to false, as that would effectively disable the tailing
+  // optimization (Seek() would be called on all immutable iterators regardless
+  // of whether the target key is greater than prev_key_).
+  current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+  // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+  // such that there are no records with keys within that range in
+  // immutable_min_heap_. Since immutable structures (SST files and immutable
+  // memtables) can't change in this version, we don't need to do a seek if
+  // 'target' belongs to that interval (immutable_min_heap_.top() is already
+  // at the correct position).
+
+  if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+    return true;
+  }
+  Slice prev_key = prev_key_.GetInternalKey();
+  if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+                               prefix_extractor_->Transform(prev_key)) != 0) {
+    return true;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+    return true;
+  }
+
+  if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+    // Nothing to seek on.
+    return false;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+                                            : current_->key()) > 0) {
+    return true;
+  }
+  return false;
+}
+
+void ForwardIterator::DeleteCurrentIter() {
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (size_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      continue;
+    }
+    if (l0_iters_[i] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      DeleteIterator(l0_iters_[i]);
+      l0_iters_[i] = nullptr;
+      return;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if (level_iters_[level - 1] == nullptr) {
+      continue;
+    }
+    if (level_iters_[level - 1] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      DeleteIterator(level_iters_[level - 1]);
+      level_iters_[level - 1] = nullptr;
+    }
+  }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+                                             int* pnum_iters) {
+  bool retval = false;
+  int deleted_iters = 0;
+  int num_iters = 0;
+
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (size_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      retval = true;
+      deleted_iters++;
+    } else {
+      num_iters++;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if ((level_iters_[level - 1] == nullptr) &&
+        (!vstorage->LevelFiles(level).empty())) {
+      retval = true;
+      deleted_iters++;
+    } else if (!vstorage->LevelFiles(level).empty()) {
+      num_iters++;
+    }
+  }
+  if ((!retval) && num_iters <= 1) {
+    retval = true;
+  }
+  if (pdeleted_iters) {
+    *pdeleted_iters = deleted_iters;
+  }
+  if (pnum_iters) {
+    *pnum_iters = num_iters;
+  }
+  return retval;
+}
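FindFileInRange(), next, is a thin wrapper around std::lower_bound: files within a level are sorted by their largest internal key, so the first file whose largest key is not less than the target is the only candidate that can contain it. A self-contained sketch of the same search over plain strings:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

static size_t FirstCandidate(const std::vector<std::string>& largest_keys,
                             const std::string& target) {
  // Files whose largest key sorts strictly before the target cannot hold it.
  auto it = std::lower_bound(largest_keys.begin(), largest_keys.end(), target);
  return static_cast<size_t>(it - largest_keys.begin());
}

int main() {
  std::vector<std::string> largest = {"d", "k", "p", "z"};  // sorted per level
  assert(FirstCandidate(largest, "e") == 1);   // file ending at "k" may hold "e"
  assert(FirstCandidate(largest, "z") == 3);   // exact match on a boundary
  assert(FirstCandidate(largest, "zz") == 4);  // past the last file
}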
+
+uint32_t ForwardIterator::FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right) {
+  auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool {
+    return cfd_->internal_comparator().InternalKeyComparator::Compare(
+               f->largest.Encode(), key) < 0;
+  };
+  const auto& b = files.begin();
+  return static_cast<uint32_t>(
+      std::lower_bound(b + left, b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+  if (iter == nullptr) {
+    return;
+  }
+
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    pinned_iters_mgr_->PinIterator(iter, is_arena);
+  } else {
+    if (is_arena) {
+      iter->~InternalIterator();
+    } else {
+      delete iter;
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 000000000..8c671c75f
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+  explicit MinIterComparator(const Comparator* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(InternalIterator* a, InternalIterator* b) {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+
+ private:
+  const Comparator* comparator_;
+};
+
+typedef std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+                            MinIterComparator> MinIterHeap;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called; subsequent Next() calls do not see values written
+ * after that point.
+ */
+class ForwardIterator : public InternalIterator {
+ public:
+  ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr);
+  virtual ~ForwardIterator();
+
+  void SeekForPrev(const Slice& /*target*/) override {
+    status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
+    valid_ = false;
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("ForwardIterator::Prev");
+    valid_ = false;
+  }
+
+  virtual bool Valid() const override;
+  void SeekToFirst() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+  virtual Status GetProperty(std::string prop_name, std::string* prop) override;
+  virtual void SetPinnedItersMgr(
+      PinnedIteratorsManager* pinned_iters_mgr) override;
+  virtual bool IsKeyPinned() const override;
+  virtual bool IsValuePinned() const override;
+
+  bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
+
+ private:
+  void Cleanup(bool release_sv);
+  // Unreference and, if needed, clean up the current SuperVersion. This is
+  // either done immediately or deferred until this iterator is unpinned by
+  // PinnedIteratorsManager.
+  void SVCleanup();
+  static void SVCleanup(DBImpl* db, SuperVersion* sv,
+                        bool background_purge_on_iterator_cleanup);
+  static void DeferredSVCleanup(void* arg);
+
+  void RebuildIterators(bool refresh_sv);
+  void RenewIterators();
+  void BuildLevelIterators(const VersionStorageInfo* vstorage);
+  void ResetIncompleteIterators();
+  void SeekInternal(const Slice& internal_key, bool seek_to_first);
+  void UpdateCurrent();
+  bool NeedToSeekImmutable(const Slice& internal_key);
+  void DeleteCurrentIter();
+  uint32_t FindFileInRange(const std::vector<FileMetaData*>& files,
+                           const Slice& internal_key, uint32_t left,
+                           uint32_t right);
+
+  bool IsOverUpperBound(const Slice& internal_key) const;
+
+  // Set PinnedIteratorsManager for all children Iterators; this function
+  // should be called whenever we update children Iterators or
+  // pinned_iters_mgr_.
+  void UpdateChildrenPinnedItersMgr();
+
+  // A helper function that will release iter in the proper manner, or pass it
+  // to pinned_iters_mgr_ to release it later if pinning is enabled.
+  void DeleteIterator(InternalIterator* iter, bool is_arena = false);
+
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  const SliceTransform* const prefix_extractor_;
+  const Comparator* user_comparator_;
+  MinIterHeap immutable_min_heap_;
+
+  SuperVersion* sv_;
+  InternalIterator* mutable_iter_;
+  std::vector<InternalIterator*> imm_iters_;
+  std::vector<InternalIterator*> l0_iters_;
+  std::vector<ForwardLevelIterator*> level_iters_;
+  InternalIterator* current_;
+  bool valid_;
+
+  // Internal iterator status; set only by one of the unsupported methods.
+  Status status_;
+  // Status of immutable iterators, maintained here to avoid iterating over
+  // all of them in status().
+  Status immutable_status_;
+  // Indicates that at least one of the immutable iterators pointed to a key
+  // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+  // need to rebuild such iterators.
+  bool has_iter_trimmed_for_upper_bound_;
+  // Is current key larger than iterate_upper_bound? If so, makes Valid()
+  // return false.
+  bool current_over_upper_bound_;
+
+  // Left endpoint of the range of keys that immutable iterators currently
+  // cover. When Seek() is called with a key that's within that range,
+  // immutable iterators don't need to be moved; see NeedToSeekImmutable().
+  // This key is included in the range after a Seek(), but excluded when
+  // advancing the iterator using Next().
+  IterKey prev_key_;
+  bool is_prev_set_;
+  bool is_prev_inclusive_;
+
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  Arena arena_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
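A note on MinIterHeap above: std::priority_queue is a max-heap, so MinIterComparator inverts the comparison (returns true when a's key is greater) to surface the smallest key at top(). The same trick, sketched with plain strings:

#include <cassert>
#include <queue>
#include <string>
#include <vector>

int main() {
  auto greater = [](const std::string& a, const std::string& b) {
    return a.compare(b) > 0;  // inverted order turns the max-heap into a min-heap
  };
  std::priority_queue<std::string, std::vector<std::string>, decltype(greater)>
      min_heap(greater);
  min_heap.push("k3");
  min_heap.push("k1");
  min_heap.push("k2");
  assert(min_heap.top() == "k1");  // the smallest key surfaces first
}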
diff --git a/src/rocksdb/db/forward_iterator_bench.cc b/src/rocksdb/db/forward_iterator_bench.cc
new file mode 100644
index 000000000..6f1223537
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator_bench.cc
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
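The benchmark below packs (shard, seqno) pairs into fixed-width big-endian integers, which is what makes numerically increasing values also increase under RocksDB's default bytewise comparator. A small standalone check of that property (assumes glibc's <endian.h> for htobe64, as the benchmark itself does):

#include <endian.h>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

int main() {
  uint64_t a = htobe64(255), b = htobe64(256);
  std::string sa(reinterpret_cast<const char*>(&a), sizeof(a));
  std::string sb(reinterpret_cast<const char*>(&b), sizeof(b));
  assert(sa < sb);  // big-endian byte order matches numeric order

  uint64_t la_v = 255, lb_v = 256;  // host (little-endian) order would not
  std::string la(reinterpret_cast<const char*>(&la_v), sizeof(la_v));
  std::string lb(reinterpret_cast<const char*>(&lb_v), sizeof(lb_v));
  assert(!(la < lb));  // 256's low byte is 0x00, sorting before 255's 0xff
}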
+ +#if !defined(GFLAGS) || defined(ROCKSDB_LITE) +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#elif defined(OS_MACOSX) || defined(OS_WIN) +// Block forward_iterator_bench under MAC and Windows +int main() { return 0; } +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "test_util/testharness.h" +#include "util/gflags_compat.h" + +const int MAX_SHARDS = 100000; + +DEFINE_int32(writers, 8, ""); +DEFINE_int32(readers, 8, ""); +DEFINE_int64(rate, 100000, ""); +DEFINE_int64(value_size, 300, ""); +DEFINE_int64(shards, 1000, ""); +DEFINE_int64(memtable_size, 500000000, ""); +DEFINE_int64(block_cache_size, 300000000, ""); +DEFINE_int64(block_size, 65536, ""); +DEFINE_double(runtime, 300.0, ""); +DEFINE_bool(cache_only_first, true, ""); +DEFINE_bool(iterate_upper_bound, true, ""); + +struct Stats { + char pad1[128] __attribute__((__unused__)); + std::atomic written{0}; + char pad2[128] __attribute__((__unused__)); + std::atomic read{0}; + std::atomic cache_misses{0}; + char pad3[128] __attribute__((__unused__)); +} stats; + +struct Key { + Key() {} + Key(uint64_t shard_in, uint64_t seqno_in) + : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {} + + uint64_t shard() const { return be64toh(shard_be); } + uint64_t seqno() const { return be64toh(seqno_be); } + + private: + uint64_t shard_be; + uint64_t seqno_be; +} __attribute__((__packed__)); + +struct Reader; +struct Writer; + +struct ShardState { + char pad1[128] __attribute__((__unused__)); + std::atomic last_written{0}; + Writer* writer; + Reader* reader; + char pad2[128] __attribute__((__unused__)); + std::atomic last_read{0}; + std::unique_ptr it; + std::unique_ptr it_cacheonly; + Key upper_bound; + ROCKSDB_NAMESPACE::Slice upper_bound_slice; + char pad3[128] __attribute__((__unused__)); +}; + +struct Reader { + public: + explicit Reader(std::vector* shard_states, + ROCKSDB_NAMESPACE::DB* db) + : shard_states_(shard_states), db_(db) { + sem_init(&sem_, 0, 0); + thread_ = port::Thread(&Reader::run, this); + } + + void run() { + while (1) { + sem_wait(&sem_); + if (done_.load()) { + break; + } + + uint64_t shard; + { + std::lock_guard guard(queue_mutex_); + assert(!shards_pending_queue_.empty()); + shard = shards_pending_queue_.front(); + shards_pending_queue_.pop(); + shards_pending_set_.reset(shard); + } + readOnceFromShard(shard); + } + } + + void readOnceFromShard(uint64_t shard) { + ShardState& state = (*shard_states_)[shard]; + if (!state.it) { + // Initialize iterators + ROCKSDB_NAMESPACE::ReadOptions options; + options.tailing = true; + if (FLAGS_iterate_upper_bound) { + state.upper_bound = Key(shard, std::numeric_limits::max()); + state.upper_bound_slice = ROCKSDB_NAMESPACE::Slice( + (const char*)&state.upper_bound, sizeof(state.upper_bound)); + options.iterate_upper_bound = &state.upper_bound_slice; + } + + state.it.reset(db_->NewIterator(options)); + + if (FLAGS_cache_only_first) { + options.read_tier = ROCKSDB_NAMESPACE::ReadTier::kBlockCacheTier; + state.it_cacheonly.reset(db_->NewIterator(options)); + } + } + + const uint64_t upto = state.last_written.load(); + for (ROCKSDB_NAMESPACE::Iterator* it : + {state.it_cacheonly.get(), state.it.get()}) { + if (it == nullptr) { + continue; + } + if (state.last_read.load() >= upto) { + break; + } + bool need_seek = 
true; + for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) { + if (need_seek) { + Key from(shard, state.last_read.load() + 1); + it->Seek(ROCKSDB_NAMESPACE::Slice((const char*)&from, sizeof(from))); + need_seek = false; + } else { + it->Next(); + } + if (it->status().IsIncomplete()) { + ++::stats.cache_misses; + break; + } + assert(it->Valid()); + assert(it->key().size() == sizeof(Key)); + Key key; + memcpy(&key, it->key().data(), it->key().size()); + // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n", + // shard, seq, key.shard(), key.seqno()); + assert(key.shard() == shard); + assert(key.seqno() == seq); + state.last_read.store(seq); + ++::stats.read; + } + } + } + + void onWrite(uint64_t shard) { + { + std::lock_guard guard(queue_mutex_); + if (!shards_pending_set_.test(shard)) { + shards_pending_queue_.push(shard); + shards_pending_set_.set(shard); + sem_post(&sem_); + } + } + } + + ~Reader() { + done_.store(true); + sem_post(&sem_); + thread_.join(); + } + + private: + char pad1[128] __attribute__((__unused__)); + std::vector* shard_states_; + ROCKSDB_NAMESPACE::DB* db_; + ROCKSDB_NAMESPACE::port::Thread thread_; + sem_t sem_; + std::mutex queue_mutex_; + std::bitset shards_pending_set_; + std::queue shards_pending_queue_; + std::atomic done_{false}; + char pad2[128] __attribute__((__unused__)); +}; + +struct Writer { + explicit Writer(std::vector* shard_states, + ROCKSDB_NAMESPACE::DB* db) + : shard_states_(shard_states), db_(db) {} + + void start() { thread_ = port::Thread(&Writer::run, this); } + + void run() { + std::queue workq; + std::chrono::steady_clock::time_point deadline( + std::chrono::steady_clock::now() + + std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime))); + std::vector my_shards; + for (int i = 1; i <= FLAGS_shards; ++i) { + if ((*shard_states_)[i].writer == this) { + my_shards.push_back(i); + } + } + + std::mt19937 rng{std::random_device()()}; + std::uniform_int_distribution shard_dist( + 0, static_cast(my_shards.size()) - 1); + std::string value(FLAGS_value_size, '*'); + + while (1) { + auto now = std::chrono::steady_clock::now(); + if (FLAGS_runtime >= 0 && now >= deadline) { + break; + } + if (workq.empty()) { + for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) { + std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate); + workq.push(now + offset); + } + } + while (!workq.empty() && workq.front() < now) { + workq.pop(); + uint64_t shard = my_shards[shard_dist(rng)]; + ShardState& state = (*shard_states_)[shard]; + uint64_t seqno = state.last_written.load() + 1; + Key key(shard, seqno); + // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno); + ROCKSDB_NAMESPACE::Status status = + db_->Put(ROCKSDB_NAMESPACE::WriteOptions(), + ROCKSDB_NAMESPACE::Slice((const char*)&key, sizeof(key)), + ROCKSDB_NAMESPACE::Slice(value)); + assert(status.ok()); + state.last_written.store(seqno); + state.reader->onWrite(shard); + ++::stats.written; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + // fprintf(stderr, "Writer done\n"); + } + + ~Writer() { thread_.join(); } + + private: + char pad1[128] __attribute__((__unused__)); + std::vector* shard_states_; + ROCKSDB_NAMESPACE::DB* db_; + ROCKSDB_NAMESPACE::port::Thread thread_; + char pad2[128] __attribute__((__unused__)); +}; + +struct StatsThread { + explicit StatsThread(ROCKSDB_NAMESPACE::DB* db) + : db_(db), thread_(&StatsThread::run, this) {} + + void run() { + // using namespace std::chrono; + auto tstart = std::chrono::steady_clock::now(), tlast = tstart; 
+ uint64_t wlast = 0, rlast = 0; + while (!done_.load()) { + { + std::unique_lock lock(cvm_); + cv_.wait_for(lock, std::chrono::seconds(1)); + } + auto now = std::chrono::steady_clock::now(); + double elapsed = + std::chrono::duration_cast >( + now - tlast).count(); + uint64_t w = ::stats.written.load(); + uint64_t r = ::stats.read.load(); + fprintf(stderr, + "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | " + "r/s %10.0f | cache misses %10ld\n", + db_->GetEnv()->TimeToString(time(nullptr)).c_str(), + std::chrono::duration_cast(now - tstart) + .count(), + w, (w - wlast) / elapsed, r, (r - rlast) / elapsed, + ::stats.cache_misses.load()); + wlast = w; + rlast = r; + tlast = now; + } + } + + ~StatsThread() { + { + std::lock_guard guard(cvm_); + done_.store(true); + } + cv_.notify_all(); + thread_.join(); + } + + private: + ROCKSDB_NAMESPACE::DB* db_; + std::mutex cvm_; + std::condition_variable cv_; + ROCKSDB_NAMESPACE::port::Thread thread_; + std::atomic done_{false}; +}; + +int main(int argc, char** argv) { + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + + std::mt19937 rng{std::random_device()()}; + ROCKSDB_NAMESPACE::Status status; + std::string path = + ROCKSDB_NAMESPACE::test::PerThreadDBPath("forward_iterator_test"); + fprintf(stderr, "db path is %s\n", path.c_str()); + ROCKSDB_NAMESPACE::Options options; + options.create_if_missing = true; + options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression; + options.compaction_style = + ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone; + options.level0_slowdown_writes_trigger = 99999; + options.level0_stop_writes_trigger = 99999; + options.use_direct_io_for_flush_and_compaction = true; + options.write_buffer_size = FLAGS_memtable_size; + ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options; + table_options.block_cache = + ROCKSDB_NAMESPACE::NewLRUCache(FLAGS_block_cache_size); + table_options.block_size = FLAGS_block_size; + options.table_factory.reset( + ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options)); + + status = ROCKSDB_NAMESPACE::DestroyDB(path, options); + assert(status.ok()); + ROCKSDB_NAMESPACE::DB* db_raw; + status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw); + assert(status.ok()); + std::unique_ptr db(db_raw); + + std::vector shard_states(FLAGS_shards + 1); + std::deque readers; + while (static_cast(readers.size()) < FLAGS_readers) { + readers.emplace_back(&shard_states, db_raw); + } + std::deque writers; + while (static_cast(writers.size()) < FLAGS_writers) { + writers.emplace_back(&shard_states, db_raw); + } + + // Each shard gets a random reader and random writer assigned to it + for (int i = 1; i <= FLAGS_shards; ++i) { + std::uniform_int_distribution reader_dist(0, FLAGS_readers - 1); + std::uniform_int_distribution writer_dist(0, FLAGS_writers - 1); + shard_states[i].reader = &readers[reader_dist(rng)]; + shard_states[i].writer = &writers[writer_dist(rng)]; + } + + StatsThread stats_thread(db_raw); + for (Writer& w : writers) { + w.start(); + } + + writers.clear(); + readers.clear(); +} +#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE) diff --git a/src/rocksdb/db/import_column_family_job.cc b/src/rocksdb/db/import_column_family_job.cc new file mode 100644 index 000000000..15af1cf80 --- /dev/null +++ b/src/rocksdb/db/import_column_family_job.cc @@ -0,0 +1,276 @@ +#ifndef ROCKSDB_LITE + +#include "db/import_column_family_job.h" + +#include +#include +#include +#include + +#include "db/version_edit.h" +#include "file/file_util.h" +#include 
"file/random_access_file_reader.h" +#include "table/merging_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_builder.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, + SuperVersion* sv) { + Status status; + + // Read the information of files we are importing + for (const auto& file_metadata : metadata_) { + const auto file_path = file_metadata.db_path + "/" + file_metadata.name; + IngestedFileInfo file_to_import; + status = GetIngestedFileInfo(file_path, &file_to_import, sv); + if (!status.ok()) { + return status; + } + files_to_import_.push_back(file_to_import); + } + + const auto ucmp = cfd_->internal_comparator().user_comparator(); + auto num_files = files_to_import_.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } else if (num_files > 1) { + // Verify that passed files don't have overlapping ranges in any particular + // level. + int min_level = 1; // Check for overlaps in Level 1 and above. + int max_level = -1; + for (const auto& file_metadata : metadata_) { + if (file_metadata.level > max_level) { + max_level = file_metadata.level; + } + } + for (int level = min_level; level <= max_level; ++level) { + autovector sorted_files; + for (size_t i = 0; i < num_files; i++) { + if (metadata_[i].level == level) { + sorted_files.push_back(&files_to_import_[i]); + } + } + + std::sort(sorted_files.begin(), sorted_files.end(), + [&ucmp](const IngestedFileInfo* info1, + const IngestedFileInfo* info2) { + return sstableKeyCompare(ucmp, info1->smallest_internal_key, + info2->smallest_internal_key) < 0; + }); + + for (size_t i = 0; i < sorted_files.size() - 1; i++) { + if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= + 0) { + return Status::InvalidArgument("Files have overlapping ranges"); + } + } + } + } + + for (const auto& f : files_to_import_) { + if (f.num_entries == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { + return Status::Corruption("File has corrupted keys"); + } + } + + // Copy/Move external files into DB + auto hardlink_files = import_options_.move_files; + for (auto& f : files_to_import_) { + f.fd = FileDescriptor(next_file_number++, 0, f.file_size); + + const auto path_outside_db = f.external_file_path; + const auto path_inside_db = TableFileName( + cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); + + if (hardlink_files) { + status = + fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr); + if (status.IsNotSupported()) { + // Original file is on a different FS, use copy instead of hard linking + hardlink_files = false; + } + } + if (!hardlink_files) { + status = CopyFile(fs_, path_outside_db, path_inside_db, 0, + db_options_.use_fsync); + } + if (!status.ok()) { + break; + } + f.copy_file = !hardlink_files; + f.internal_file_path = path_inside_db; + } + + if (!status.ok()) { + // We failed, remove all files that we copied into the db + for (const auto& f : files_to_import_) { + if (f.internal_file_path.empty()) { + break; + } + const auto s = + fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } + + 
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+  Status status;
+  edit_.SetColumnFamily(cfd_->GetID());
+
+  // We use the import time as the ancestor time. This is the time the data
+  // is written to the database.
+  int64_t temp_current_time = 0;
+  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+  uint64_t current_time = kUnknownOldestAncesterTime;
+  if (env_->GetCurrentTime(&temp_current_time).ok()) {
+    current_time = oldest_ancester_time =
+        static_cast<uint64_t>(temp_current_time);
+  }
+
+  for (size_t i = 0; i < files_to_import_.size(); ++i) {
+    const auto& f = files_to_import_[i];
+    const auto& file_metadata = metadata_[i];
+
+    edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+                  f.fd.GetFileSize(), f.smallest_internal_key,
+                  f.largest_internal_key, file_metadata.smallest_seqno,
+                  file_metadata.largest_seqno, false, kInvalidBlobFileNumber,
+                  oldest_ancester_time, current_time, kUnknownFileChecksum,
+                  kUnknownFileChecksumFuncName);
+
+    // If incoming sequence number is higher, update local sequence number.
+    if (file_metadata.largest_seqno > versions_->LastSequence()) {
+      versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+      versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+      versions_->SetLastSequence(file_metadata.largest_seqno);
+    }
+  }
+
+  return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+  if (!status.ok()) {
+    // We failed to add files to the database; remove all the files we copied.
+    for (const auto& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  } else if (status.ok() && import_options_.move_files) {
+    // The files were moved and added successfully; remove original file links
+    for (IngestedFileInfo& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "%s was added to DB successfully but failed to remove original "
+            "file link : %s",
+            f.external_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+}
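Prepare() above attempts a hard link and falls back to a byte copy when linking is not supported (for example, when source and destination sit on different filesystems). A standalone sketch of the same fallback using C++17 std::filesystem, independent of RocksDB's FileSystem abstraction:

#include <filesystem>
#include <fstream>
#include <system_error>

namespace fs = std::filesystem;

// Returns true if the destination ended up as a copy rather than a hard link.
static bool LinkOrCopy(const fs::path& src, const fs::path& dst) {
  std::error_code ec;
  fs::create_hard_link(src, dst, ec);
  if (!ec) {
    return false;  // hard link succeeded; no bytes were duplicated
  }
  // EXDEV ("cross-device link") and similar failures mean we must copy.
  fs::copy_file(src, dst, fs::copy_options::overwrite_existing);
  return true;
}

int main() {
  const fs::path src = fs::temp_directory_path() / "ingest_src.sst";
  std::ofstream(src.string()) << "dummy payload";
  const fs::path dst = fs::temp_directory_path() / "ingest_dst.sst";
  fs::remove(dst);
  (void)LinkOrCopy(src, dst);  // same-directory case normally links
}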
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+    const std::string& external_file, IngestedFileInfo* file_to_import,
+    SuperVersion* sv) {
+  file_to_import->external_file_path = external_file;
+
+  // Get external file size
+  Status status = fs_->GetFileSize(external_file, IOOptions(),
+                                   &file_to_import->file_size, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Create TableReader for external file
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<FSRandomAccessFile> sst_file;
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+  status = fs_->NewRandomAccessFile(external_file, env_options_,
+                                    &sst_file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  sst_file_reader.reset(
+      new RandomAccessFileReader(std::move(sst_file), external_file));
+
+  status = cfd_->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(*cfd_->ioptions(),
+                         sv->mutable_cf_options.prefix_extractor.get(),
+                         env_options_, cfd_->internal_comparator()),
+      std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external file properties
+  auto props = table_reader->GetTableProperties();
+
+  // Set original_seqno to 0.
+  file_to_import->original_seqno = 0;
+
+  // Get number of entries in table
+  file_to_import->num_entries = props->num_entries;
+
+  ParsedInternalKey key;
+  ReadOptions ro;
+  // While reading the external file we can cache the blocks we read in the
+  // block cache. If we later change the global seqno of this file, the cache
+  // would contain blocks whose keys carry the wrong seqno.
+  // We need to disable fill_cache so that we read from the file without
+  // updating the block cache.
+  ro.fill_cache = false;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+  // Get first (smallest) key from file
+  iter->SeekToFirst();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("external file have corrupted keys");
+  }
+  file_to_import->smallest_internal_key.SetFrom(key);
+
+  // Get last (largest) key from file
+  iter->SeekToLast();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("external file have corrupted keys");
+  }
+  file_to_import->largest_internal_key.SetFrom(key);
+
+  file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+  file_to_import->table_properties = *props;
+
+  return status;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/import_column_family_job.h b/src/rocksdb/db/import_column_family_job.h
new file mode 100644
index 000000000..160fd1247
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/dbformat.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+  ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd,
+                        const ImmutableDBOptions& db_options,
+                        const EnvOptions& env_options,
+                        const ImportColumnFamilyOptions& import_options,
+                        const std::vector<LiveFileMetaData>& metadata)
+      : env_(env),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        fs_(db_options_.fs.get()),
+        env_options_(env_options),
+        import_options_(import_options),
+        metadata_(metadata) {}
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+  // Will execute the import job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_import() const {
+    return files_to_import_;
+  }
+
+ private:
+  // Open the external file and populate `file_to_import` with all the
+  // external information we need to import this file.
+  Status GetIngestedFileInfo(const std::string& external_file,
+                             IngestedFileInfo* file_to_import,
+                             SuperVersion* sv);
+
+  Env* env_;
+  VersionSet* versions_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  FileSystem* fs_;
+  const EnvOptions& env_options_;
+  autovector<IngestedFileInfo> files_to_import_;
+  VersionEdit edit_;
+  const ImportColumnFamilyOptions& import_options_;
+  std::vector<LiveFileMetaData> metadata_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
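The tests that follow exercise the full round trip through this job. Condensed to the user-facing calls (the same APIs the tests use), the sequence looks roughly like this, with error handling reduced to asserts and an illustrative path:

#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/utilities/checkpoint.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options opts;
  opts.create_if_missing = true;
  assert(rocksdb::DB::Open(opts, "/tmp/import_demo", &db).ok());

  // 1. Export an existing column family (default CF here) to a directory.
  rocksdb::Checkpoint* checkpoint = nullptr;
  assert(rocksdb::Checkpoint::Create(db, &checkpoint).ok());
  rocksdb::ExportImportFilesMetaData* metadata = nullptr;
  assert(checkpoint
             ->ExportColumnFamily(db->DefaultColumnFamily(),
                                  "/tmp/import_demo_export", &metadata)
             .ok());
  delete checkpoint;

  // 2. Re-import the exported files as a brand-new column family.
  rocksdb::ColumnFamilyHandle* new_cf = nullptr;
  assert(db->CreateColumnFamilyWithImport(rocksdb::ColumnFamilyOptions(),
                                          "imported_cf",
                                          rocksdb::ImportColumnFamilyOptions(),
                                          *metadata, &new_cf)
             .ok());
  delete metadata;
  db->DestroyColumnFamilyHandle(new_cf);
  delete db;
}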
diff --git a/src/rocksdb/db/import_column_family_test.cc b/src/rocksdb/db/import_column_family_test.cc
new file mode 100644
index 000000000..a25560b7c
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_test.cc
@@ -0,0 +1,567 @@
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ImportColumnFamilyTest : public DBTestBase {
+ public:
+  ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    DestroyAndRecreateExternalSSTFilesDir();
+    export_files_dir_ = test::TmpDir(env_) + "/export";
+    import_cfh_ = nullptr;
+    import_cfh2_ = nullptr;
+    metadata_ptr_ = nullptr;
+  }
+
+  ~ImportColumnFamilyTest() {
+    if (import_cfh_) {
+      db_->DropColumnFamily(import_cfh_);
+      db_->DestroyColumnFamilyHandle(import_cfh_);
+      import_cfh_ = nullptr;
+    }
+    if (import_cfh2_) {
+      db_->DropColumnFamily(import_cfh2_);
+      db_->DestroyColumnFamilyHandle(import_cfh2_);
+      import_cfh2_ = nullptr;
+    }
+    if (metadata_ptr_) {
+      delete metadata_ptr_;
+      metadata_ptr_ = nullptr;
+    }
+    test::DestroyDir(env_, sst_files_dir_);
+    test::DestroyDir(env_, export_files_dir_);
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    test::DestroyDir(env_, sst_files_dir_);
+    env_->CreateDir(sst_files_dir_);
+    test::DestroyDir(env_, export_files_dir_);
+  }
+
+  LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path,
+                                        int level,
+                                        SequenceNumber smallest_seqno,
+                                        SequenceNumber largest_seqno) {
+    LiveFileMetaData metadata;
+    metadata.name = name;
+    metadata.db_path = path;
+    metadata.smallest_seqno = smallest_seqno;
+    metadata.largest_seqno = largest_seqno;
+    metadata.level = level;
+    return metadata;
+  }
+
+ protected:
+  std::string sst_files_dir_;
+  std::string export_files_dir_;
+  ColumnFamilyHandle* import_cfh_;
+  ColumnFamilyHandle* import_cfh2_;
+  ExportImportFilesMetaData* metadata_ptr_;
+};
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+  SstFileWriter sfw_unknown(EnvOptions(), options);
+
+  // cf1.sst
+  const std::string cf1_sst_name = "cf1.sst";
+  const std::string cf1_sst = sst_files_dir_ + cf1_sst_name;
+  ASSERT_OK(sfw_cf1.Open(cf1_sst));
+  ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+  ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // cf_unknown.sst
+  const std::string unknown_sst_name = "cf_unknown.sst";
+  const std::string unknown_sst = sst_files_dir_ + unknown_sst_name;
+  ASSERT_OK(sfw_unknown.Open(unknown_sst));
+  ASSERT_OK(sfw_unknown.Put("K3", "V1"));
+  ASSERT_OK(sfw_unknown.Put("K4", "V2"));
+  ASSERT_OK(sfw_unknown.Finish());
+
+  {
+    // Import sst file corresponding to cf1 onto a new cf and verify
+    ExportImportFilesMetaData metadata;
+    metadata.files.push_back(
+        LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19));
metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_EQ(value, "V2"); + ASSERT_OK(db_->DropColumnFamily(import_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + } + + { + // Import sst file corresponding to unknown cf onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_EQ(value, "V2"); + } +} + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + + // file3.sst + const std::string file3_sst_name = "file3.sst"; + const std::string file3_sst = sst_files_dir_ + file3_sst_name; + ASSERT_OK(sfw_cf1.Open(file3_sst)); + for (int i = 0; i < 100; ++i) { + sfw_cf1.Put(Key(i), Key(i) + "_val"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file2.sst + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + for (int i = 0; i < 100; i += 2) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1a.sst + const std::string file1a_sst_name = "file1a.sst"; + const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; + ASSERT_OK(sfw_cf1.Open(file1a_sst)); + for (int i = 0; i < 52; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1b.sst + const std::string file1b_sst_name = "file1b.sst"; + const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; + ASSERT_OK(sfw_cf1.Open(file1b_sst)); + for (int i = 52; i < 100; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0a.sst + const std::string file0a_sst_name = "file0a.sst"; + const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; + ASSERT_OK(sfw_cf1.Open(file0a_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0b.sst + const std::string file0b_sst_name = "file0b.sst"; + const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; + ASSERT_OK(sfw_cf1.Open(file0b_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // Import sst files and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29)); + metadata.files.push_back( + LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34)); + metadata.files.push_back( + 
LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39)); + metadata.files.push_back( + LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49)); + metadata.files.push_back( + LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + for (int i = 0; i < 100; i += 5) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5")); + } + + // Flush and check again + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + // Compact and check again. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + *metadata_ptr_, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + import_options.move_files = true; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options, + *metadata_ptr_, &import_cfh2_)); + ASSERT_NE(import_cfh2_, nullptr); + delete metadata_ptr_; + metadata_ptr_ = NULL; + + std::string value1, value2; + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Get(1, Key(i)), value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Modify keys in cf1 and verify. + for (int i = 0; i < 25; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i))); + } + for (int i = 25; i < 50; i++) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3")); + } + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Compact and check again. + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + // Compact to create a L1 file. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 50; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + + for (int i = 0; i < 25; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + // Create a new db and import the files. + DB* db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + + for (int i = 0; i < 100; ++i) { + std::string value; + db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_EQ(Get(1, Key(i)), value); + } + db_copy->DropColumnFamily(cfh); + db_copy->DestroyColumnFamilyHandle(cfh); + delete db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); +} + +TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + { + // Create column family with existing cf name. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Column family already exists")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with no files specified. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("The list of files is empty")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with overlapping keys in sst files. + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K3", "V3")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Files have overlapping ranges")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with a mismatching comparator, should fail with appropriate error. 
+ ExportImportFilesMetaData metadata; + Options mismatch_options = CurrentOptions(); + mismatch_options.comparator = ReverseBytewiseComparator(); + SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = mismatch_options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Comparator name mismatch")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with non existent sst file should fail with appropriate error + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file3_sst_name = "file3.sst"; + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::IOError("No such file or directory")); + ASSERT_EQ(import_cfh_, nullptr); + + // Test successful import after a failure with the same CF name. Ensures + // there is no side effect with CF when there is a failed import + metadata.files.pop_back(); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as External SST File Writer and Import are not supported " + "in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc new file mode 100644 index 000000000..f729ee7c7 --- /dev/null +++ b/src/rocksdb/db/internal_stats.cc @@ -0,0 +1,1424 @@ +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "db/internal_stats.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
+    {
+        {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}},
+        {LevelStatType::COMPACTED_FILES,
+         LevelStat{"CompactedFiles", "CompactedFiles"}},
+        {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}},
+        {LevelStatType::SCORE, LevelStat{"Score", "Score"}},
+        {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}},
+        {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
+        {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
+        {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+        {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
+        {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
+        {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
+        {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}},
+        {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}},
+        {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}},
+        {LevelStatType::COMP_CPU_SEC,
+         LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}},
+        {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}},
+        {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}},
+        {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}},
+        {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}},
+};
+
+namespace {
+const double kMB = 1048576.0;
+const double kGB = kMB * 1024;
+const double kMicrosInSec = 1000000.0;
+
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
+                           const std::string& group_by) {
+  int written_size =
+      snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str());
+  auto hdr = [](LevelStatType t) {
+    return InternalStats::compaction_level_stats.at(t).header_name.c_str();
+  };
+  int line_size = snprintf(
+      buf + written_size, len - written_size,
+      "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+      // Note that we skip COMPACTED_FILES and merge it with Files column
+      group_by.c_str(), hdr(LevelStatType::NUM_FILES),
+      hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
+      hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
+      hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
+      hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
+      hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
+      hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
+      hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
+      hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
+      hdr(LevelStatType::KEY_DROP));
+
+  written_size += line_size;
+  snprintf(buf + written_size, len - written_size, "%s\n",
+           std::string(line_size, '-').c_str());
+}
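PrepareLevelStats() below reduces raw byte counters to the derived columns printed under the header above. The arithmetic in isolation, with made-up inputs:

#include <cassert>
#include <cstdint>

int main() {
  const double kMB = 1048576.0, kMicrosInSec = 1000000.0;
  uint64_t bytes_read_non_output = 8ull << 30;  // 8 GiB read from input levels
  uint64_t bytes_read_output = 2ull << 30;      // 2 GiB re-read from output level
  uint64_t bytes_written = 9ull << 30;          // 9 GiB written out
  uint64_t micros = 30 * 1000000ull;            // 30 s of compaction time

  uint64_t bytes_read = bytes_read_non_output + bytes_read_output;
  // Wnew: bytes written minus what was merely rewritten from the output level.
  int64_t bytes_new = static_cast<int64_t>(bytes_written) -
                      static_cast<int64_t>(bytes_read_output);
  double elapsed = (micros + 1) / kMicrosInSec;  // +1 guards divide-by-zero
  double read_mbps = bytes_read / kMB / elapsed;

  assert(bytes_read == 10ull << 30);
  assert(bytes_new == 7ll << 30);
  (void)read_mbps;  // would populate the Rd(MB/s) column
}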
+  (*level_stats)[LevelStatType::SCORE] = score;
+  (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB;
+  (*level_stats)[LevelStatType::RN_GB] =
+      stats.bytes_read_non_output_levels / kGB;
+  (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
+  (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+  (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
+  (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
+  (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
+  (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed;
+  (*level_stats)[LevelStatType::WRITE_MBPS] =
+      stats.bytes_written / kMB / elapsed;
+  (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec;
+  (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec;
+  (*level_stats)[LevelStatType::COMP_COUNT] = stats.count;
+  (*level_stats)[LevelStatType::AVG_SEC] =
+      stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count;
+  (*level_stats)[LevelStatType::KEY_IN] =
+      static_cast<double>(stats.num_input_records);
+  (*level_stats)[LevelStatType::KEY_DROP] =
+      static_cast<double>(stats.num_dropped_records);
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+                     const std::map<LevelStatType, double>& stat_value) {
+  snprintf(
+      buf, len,
+      "%4s "      /* Level */
+      "%6d/%-3d " /* Files */
+      "%8s "      /* Size */
+      "%5.1f "    /* Score */
+      "%8.1f "    /* Read(GB) */
+      "%7.1f "    /* Rn(GB) */
+      "%8.1f "    /* Rnp1(GB) */
+      "%9.1f "    /* Write(GB) */
+      "%8.1f "    /* Wnew(GB) */
+      "%9.1f "    /* Moved(GB) */
+      "%5.1f "    /* W-Amp */
+      "%8.1f "    /* Rd(MB/s) */
+      "%8.1f "    /* Wr(MB/s) */
+      "%9.2f "    /* Comp(sec) */
+      "%17.2f "   /* CompMergeCPU(sec) */
+      "%9d "      /* Comp(cnt) */
+      "%8.3f "    /* Avg(sec) */
+      "%7s "      /* KeyIn */
+      "%6s\n",    /* KeyDrop */
+      name.c_str(), static_cast<int>(stat_value.at(LevelStatType::NUM_FILES)),
+      static_cast<int>(stat_value.at(LevelStatType::COMPACTED_FILES)),
+      BytesToHumanString(
+          static_cast<uint64_t>(stat_value.at(LevelStatType::SIZE_BYTES)))
+          .c_str(),
+      stat_value.at(LevelStatType::SCORE),
+      stat_value.at(LevelStatType::READ_GB),
+      stat_value.at(LevelStatType::RN_GB),
+      stat_value.at(LevelStatType::RNP1_GB),
+      stat_value.at(LevelStatType::WRITE_GB),
+      stat_value.at(LevelStatType::W_NEW_GB),
+      stat_value.at(LevelStatType::MOVED_GB),
+      stat_value.at(LevelStatType::WRITE_AMP),
+      stat_value.at(LevelStatType::READ_MBPS),
+      stat_value.at(LevelStatType::WRITE_MBPS),
+      stat_value.at(LevelStatType::COMP_SEC),
+      stat_value.at(LevelStatType::COMP_CPU_SEC),
+      static_cast<int>(stat_value.at(LevelStatType::COMP_COUNT)),
+      stat_value.at(LevelStatType::AVG_SEC),
+      NumberToHumanString(
+          static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_IN)))
+          .c_str(),
+      NumberToHumanString(
+          static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_DROP)))
+          .c_str());
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+                     int num_files, int being_compacted,
+                     double total_file_size, double score, double w_amp,
+                     const InternalStats::CompactionStats& stats) {
+  std::map<LevelStatType, double> level_stats;
+  PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size,
+                    score, w_amp, stats);
+  PrintLevelStats(buf, len, name, level_stats);
+}
+
+// Assumes that trailing numbers represent an optional argument. This requires
+// property names to not end with numbers.
+std::pair<Slice, Slice> GetPropertyNameAndArg(const Slice& property) {
+  Slice name = property, arg = property;
+  size_t sfx_len = 0;
+  while (sfx_len < property.size() &&
+         isdigit(property[property.size() - sfx_len - 1])) {
+    ++sfx_len;
+  }
+  name.remove_suffix(sfx_len);
+  arg.remove_prefix(property.size() - sfx_len);
+  return {name, arg};
+}
+}  // anonymous namespace
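As an aside, a minimal sketch of the name/argument split performed by GetPropertyNameAndArg above; the property string follows the kNumFilesAtLevelPrefix convention defined just below, and the assert-based driver is illustrative only:

    // Trailing digits become the argument; the rest is the lookup key.
    Slice property("rocksdb.num-files-at-level5");
    auto name_and_arg = GetPropertyNameAndArg(property);
    assert(name_and_arg.first.ToString() == "rocksdb.num-files-at-level");
    assert(name_and_arg.second.ToString() == "5");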
"actual-delayed-write-rate"; +static const std::string is_write_stopped = "is-write-stopped"; +static const std::string estimate_oldest_key_time = "estimate-oldest-key-time"; +static const std::string block_cache_capacity = "block-cache-capacity"; +static const std::string block_cache_usage = "block-cache-usage"; +static const std::string block_cache_pinned_usage = "block-cache-pinned-usage"; +static const std::string options_statistics = "options-statistics"; + +const std::string DB::Properties::kNumFilesAtLevelPrefix = + rocksdb_prefix + num_files_at_level_prefix; +const std::string DB::Properties::kCompressionRatioAtLevelPrefix = + rocksdb_prefix + compression_ratio_at_level_prefix; +const std::string DB::Properties::kStats = rocksdb_prefix + allstats; +const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables; +const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats; +const std::string DB::Properties::kCFStatsNoFileHistogram = + rocksdb_prefix + cfstats_no_file_histogram; +const std::string DB::Properties::kCFFileHistogram = + rocksdb_prefix + cf_file_histogram; +const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; +const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; +const std::string DB::Properties::kNumImmutableMemTable = + rocksdb_prefix + num_immutable_mem_table; +const std::string DB::Properties::kNumImmutableMemTableFlushed = + rocksdb_prefix + num_immutable_mem_table_flushed; +const std::string DB::Properties::kMemTableFlushPending = + rocksdb_prefix + mem_table_flush_pending; +const std::string DB::Properties::kCompactionPending = + rocksdb_prefix + compaction_pending; +const std::string DB::Properties::kNumRunningCompactions = + rocksdb_prefix + num_running_compactions; +const std::string DB::Properties::kNumRunningFlushes = + rocksdb_prefix + num_running_flushes; +const std::string DB::Properties::kBackgroundErrors = + rocksdb_prefix + background_errors; +const std::string DB::Properties::kCurSizeActiveMemTable = + rocksdb_prefix + cur_size_active_mem_table; +const std::string DB::Properties::kCurSizeAllMemTables = + rocksdb_prefix + cur_size_all_mem_tables; +const std::string DB::Properties::kSizeAllMemTables = + rocksdb_prefix + size_all_mem_tables; +const std::string DB::Properties::kNumEntriesActiveMemTable = + rocksdb_prefix + num_entries_active_mem_table; +const std::string DB::Properties::kNumEntriesImmMemTables = + rocksdb_prefix + num_entries_imm_mem_tables; +const std::string DB::Properties::kNumDeletesActiveMemTable = + rocksdb_prefix + num_deletes_active_mem_table; +const std::string DB::Properties::kNumDeletesImmMemTables = + rocksdb_prefix + num_deletes_imm_mem_tables; +const std::string DB::Properties::kEstimateNumKeys = + rocksdb_prefix + estimate_num_keys; +const std::string DB::Properties::kEstimateTableReadersMem = + rocksdb_prefix + estimate_table_readers_mem; +const std::string DB::Properties::kIsFileDeletionsEnabled = + rocksdb_prefix + is_file_deletions_enabled; +const std::string DB::Properties::kNumSnapshots = + rocksdb_prefix + num_snapshots; +const std::string DB::Properties::kOldestSnapshotTime = + rocksdb_prefix + oldest_snapshot_time; +const std::string DB::Properties::kOldestSnapshotSequence = + rocksdb_prefix + oldest_snapshot_sequence; +const std::string DB::Properties::kNumLiveVersions = + rocksdb_prefix + num_live_versions; +const std::string DB::Properties::kCurrentSuperVersionNumber = + rocksdb_prefix + current_version_number; +const std::string 
+const std::string DB::Properties::kEstimateLiveDataSize =
+    rocksdb_prefix + estimate_live_data_size;
+const std::string DB::Properties::kMinLogNumberToKeep =
+    rocksdb_prefix + min_log_number_to_keep_str;
+const std::string DB::Properties::kMinObsoleteSstNumberToKeep =
+    rocksdb_prefix + min_obsolete_sst_number_to_keep_str;
+const std::string DB::Properties::kTotalSstFilesSize =
+    rocksdb_prefix + total_sst_files_size;
+const std::string DB::Properties::kLiveSstFilesSize =
+    rocksdb_prefix + live_sst_files_size;
+const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str;
+const std::string DB::Properties::kEstimatePendingCompactionBytes =
+    rocksdb_prefix + estimate_pending_comp_bytes;
+const std::string DB::Properties::kAggregatedTableProperties =
+    rocksdb_prefix + aggregated_table_properties;
+const std::string DB::Properties::kAggregatedTablePropertiesAtLevel =
+    rocksdb_prefix + aggregated_table_properties_at_level;
+const std::string DB::Properties::kActualDelayedWriteRate =
+    rocksdb_prefix + actual_delayed_write_rate;
+const std::string DB::Properties::kIsWriteStopped =
+    rocksdb_prefix + is_write_stopped;
+const std::string DB::Properties::kEstimateOldestKeyTime =
+    rocksdb_prefix + estimate_oldest_key_time;
+const std::string DB::Properties::kBlockCacheCapacity =
+    rocksdb_prefix + block_cache_capacity;
+const std::string DB::Properties::kBlockCacheUsage =
+    rocksdb_prefix + block_cache_usage;
+const std::string DB::Properties::kBlockCachePinnedUsage =
+    rocksdb_prefix + block_cache_pinned_usage;
+const std::string DB::Properties::kOptionsStatistics =
+    rocksdb_prefix + options_statistics;
+
+const std::unordered_map<std::string, DBPropertyInfo>
+    InternalStats::ppt_name_to_info = {
+        {DB::Properties::kNumFilesAtLevelPrefix,
+         {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr,
+          nullptr}},
+        {DB::Properties::kCompressionRatioAtLevelPrefix,
+         {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr,
+          nullptr, nullptr}},
+        {DB::Properties::kLevelStats,
+         {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}},
+        {DB::Properties::kStats,
+         {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}},
+        {DB::Properties::kCFStats,
+         {false, &InternalStats::HandleCFStats, nullptr,
+          &InternalStats::HandleCFMapStats, nullptr}},
+        {DB::Properties::kCFStatsNoFileHistogram,
+         {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr,
+          nullptr}},
+        {DB::Properties::kCFFileHistogram,
+         {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr,
+          nullptr}},
+        {DB::Properties::kDBStats,
+         {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}},
+        {DB::Properties::kSSTables,
+         {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}},
+        {DB::Properties::kAggregatedTableProperties,
+         {false, &InternalStats::HandleAggregatedTableProperties, nullptr,
+          nullptr, nullptr}},
+        {DB::Properties::kAggregatedTablePropertiesAtLevel,
+         {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel,
+          nullptr, nullptr, nullptr}},
+        {DB::Properties::kNumImmutableMemTable,
+         {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr,
+          nullptr}},
+        {DB::Properties::kNumImmutableMemTableFlushed,
+         {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed,
+          nullptr, nullptr}},
+        {DB::Properties::kMemTableFlushPending,
+         {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr,
+          nullptr}},
+        {DB::Properties::kCompactionPending,
+         {false, nullptr, &InternalStats::HandleCompactionPending, nullptr,
+          nullptr}},
+        {DB::Properties::kBackgroundErrors,
+         {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr,
+          nullptr}},
+        {DB::Properties::kCurSizeActiveMemTable,
+         {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr,
+          nullptr}},
+        {DB::Properties::kCurSizeAllMemTables,
+         {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kSizeAllMemTables,
+         {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kNumEntriesActiveMemTable,
+         {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable,
+          nullptr, nullptr}},
+        {DB::Properties::kNumEntriesImmMemTables,
+         {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kNumDeletesActiveMemTable,
+         {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable,
+          nullptr, nullptr}},
+        {DB::Properties::kNumDeletesImmMemTables,
+         {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kEstimateNumKeys,
+         {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr,
+          nullptr}},
+        {DB::Properties::kEstimateTableReadersMem,
+         {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr,
+          nullptr}},
+        {DB::Properties::kIsFileDeletionsEnabled,
+         {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr,
+          nullptr}},
+        {DB::Properties::kNumSnapshots,
+         {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr,
+          nullptr}},
+        {DB::Properties::kOldestSnapshotTime,
+         {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr,
+          nullptr}},
+        {DB::Properties::kOldestSnapshotSequence,
+         {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr,
+          nullptr}},
+        {DB::Properties::kNumLiveVersions,
+         {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr,
+          nullptr}},
+        {DB::Properties::kCurrentSuperVersionNumber,
+         {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber,
+          nullptr, nullptr}},
+        {DB::Properties::kEstimateLiveDataSize,
+         {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr,
+          nullptr}},
+        {DB::Properties::kMinLogNumberToKeep,
+         {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr,
+          nullptr}},
+        {DB::Properties::kMinObsoleteSstNumberToKeep,
+         {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep,
+          nullptr, nullptr}},
+        {DB::Properties::kBaseLevel,
+         {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}},
+        {DB::Properties::kTotalSstFilesSize,
+         {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr,
+          nullptr}},
+        {DB::Properties::kLiveSstFilesSize,
+         {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr,
+          nullptr}},
+        {DB::Properties::kEstimatePendingCompactionBytes,
+         {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes,
+          nullptr, nullptr}},
+        {DB::Properties::kNumRunningFlushes,
+         {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr,
+          nullptr}},
+        {DB::Properties::kNumRunningCompactions,
+         {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr,
+          nullptr}},
+        {DB::Properties::kActualDelayedWriteRate,
+         {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
+          nullptr}},
+        {DB::Properties::kIsWriteStopped,
+         {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr,
+          nullptr}},
+        {DB::Properties::kEstimateOldestKeyTime,
+         {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr,
+          nullptr}},
+        {DB::Properties::kBlockCacheCapacity,
+         {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr,
+          nullptr}},
+        {DB::Properties::kBlockCacheUsage,
+         {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr,
+          nullptr}},
+        {DB::Properties::kBlockCachePinnedUsage,
+         {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr,
+          nullptr}},
+        {DB::Properties::kOptionsStatistics,
+         {false, nullptr, nullptr, nullptr,
+          &DBImpl::GetPropertyHandleOptionsStatistics}},
+};
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
+  std::string ppt_name = GetPropertyNameAndArg(property).first.ToString();
+  auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name);
+  if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) {
+    return nullptr;
+  }
+  return &ppt_info_iter->second;
+}
+
+bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info,
+                                      const Slice& property,
+                                      std::string* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_string != nullptr);
+  Slice arg = GetPropertyNameAndArg(property).second;
+  return (this->*(property_info.handle_string))(value, arg);
+}
+
+bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info,
+                                   const Slice& /*property*/,
+                                   std::map<std::string, std::string>* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_map != nullptr);
+  return (this->*(property_info.handle_map))(value);
+}
+
+bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info,
+                                   uint64_t* value, DBImpl* db) {
+  assert(value != nullptr);
+  assert(property_info.handle_int != nullptr &&
+         !property_info.need_out_of_mutex);
+  db->mutex_.AssertHeld();
+  return (this->*(property_info.handle_int))(value, db, nullptr /* version */);
+}
+
+bool InternalStats::GetIntPropertyOutOfMutex(
+    const DBPropertyInfo& property_info, Version* version, uint64_t* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_int != nullptr &&
+         property_info.need_out_of_mutex);
+  return (this->*(property_info.handle_int))(value, nullptr /* db */, version);
+}
+
+bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) {
+  uint64_t level;
+  const auto* vstorage = cfd_->current()->storage_info();
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  } else {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%d",
+             vstorage->NumLevelFiles(static_cast<int>(level)));
+    *value = buf;
+    return true;
+  }
+}
+
+bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value,
+                                                        Slice suffix) {
+  uint64_t level;
+  const auto* vstorage = cfd_->current()->storage_info();
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || level >= static_cast<uint64_t>(number_levels_)) {
+    return false;
+  }
+  *value = ToString(
+      vstorage->GetEstimatedCompressionRatioAtLevel(static_cast<int>(level)));
+  return true;
+}
+
+bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) {
+  char buf[1000];
+  const auto* vstorage = cfd_->current()->storage_info();
+  snprintf(buf, sizeof(buf),
+           "Level Files Size(MB)\n"
+           "--------------------\n");
+  value->append(buf);
+
+  for (int level = 0; level < number_levels_; level++) {
+    snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+             vstorage->NumLevelFiles(level),
+             vstorage->NumLevelBytes(level) / kMB);
+    value->append(buf);
+  }
+  return true;
+}
+
+bool InternalStats::HandleStats(std::string* value, Slice suffix) {
+  if (!HandleCFStats(value, suffix)) {
+    return false;
+  }
+  if (!HandleDBStats(value, suffix)) {
+    return false;
+  }
+  return true;
+}
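For context, a simplified sketch of how a property string reaches one of the handlers above. The registry maps the name part to a DBPropertyInfo, and the single populated pointer-to-member decides string vs. int vs. map retrieval; `internal_stats` here stands in for a column family's InternalStats instance (the exact call site in DBImpl differs):

    const Slice property("rocksdb.num-files-at-level0");
    const DBPropertyInfo* info = GetPropertyInfo(property);
    std::string value;
    if (info != nullptr && info->handle_string != nullptr) {
      // Dispatch through the populated pointer-to-member handler.
      internal_stats->GetStringProperty(*info, property, &value);
    }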
+bool InternalStats::HandleCFMapStats(
+    std::map<std::string, std::string>* cf_stats) {
+  DumpCFMapStats(cf_stats);
+  return true;
+}
+
+bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) {
+  DumpCFStats(value);
+  return true;
+}
+
+bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value,
+                                                 Slice /*suffix*/) {
+  DumpCFStatsNoFileHistogram(value);
+  return true;
+}
+
+bool InternalStats::HandleCFFileHistogram(std::string* value,
+                                          Slice /*suffix*/) {
+  DumpCFFileHistogram(value);
+  return true;
+}
+
+bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) {
+  DumpDBStats(value);
+  return true;
+}
+
+bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) {
+  auto* current = cfd_->current();
+  *value = current->DebugString(true, true);
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTableProperties(std::string* value,
+                                                    Slice /*suffix*/) {
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+  if (!s.ok()) {
+    return false;
+  }
+  *value = tp->ToString();
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value,
+                                                           Slice suffix) {
+  uint64_t level;
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(
+      &tp, static_cast<int>(level));
+  if (!s.ok()) {
+    return false;
+  }
+  *value = tp->ToString();
+  return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  *value = cfd_->imm()->NumNotFlushed();
+  return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value,
+                                                      DBImpl* /*db*/,
+                                                      Version* /*version*/) {
+  *value = cfd_->imm()->NumFlushed();
+  return true;
+}
+
+bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
+  return true;
+}
+
+bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db,
+                                            Version* /*version*/) {
+  *value = db->num_running_flushes();
+  return true;
+}
+
+bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  // 1 if the system already determines at least one compaction is needed;
+  // 0 otherwise.
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
+  return true;
+}
+
+bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+                                                Version* /*version*/) {
+  *value = db->num_running_compactions_;
+  return true;
+}
+
+bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  // Accumulated number of errors in background flushes or compactions.
+  *value = GetBackgroundErrorCount();
+  return true;
+}
+bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value,
+                                                DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  // Current size of the active memtable
+  *value = cfd_->mem()->ApproximateMemoryUsage();
+  return true;
+}
+
+bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+                                              Version* /*version*/) {
+  // Current size of the active memtable + immutable memtables
+  *value = cfd_->mem()->ApproximateMemoryUsage() +
+           cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+  return true;
+}
+
+bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  *value = cfd_->mem()->ApproximateMemoryUsage() +
+           cfd_->imm()->ApproximateMemoryUsage();
+  return true;
+}
+
+bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value,
+                                                   DBImpl* /*db*/,
+                                                   Version* /*version*/) {
+  // Current number of entries in the active memtable
+  *value = cfd_->mem()->num_entries();
+  return true;
+}
+
+bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value,
+                                                 DBImpl* /*db*/,
+                                                 Version* /*version*/) {
+  // Current number of entries in the immutable memtables
+  *value = cfd_->imm()->current()->GetTotalNumEntries();
+  return true;
+}
+
+bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value,
+                                                   DBImpl* /*db*/,
+                                                   Version* /*version*/) {
+  // Current number of deletes in the active memtable
+  *value = cfd_->mem()->num_deletes();
+  return true;
+}
+
+bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value,
+                                                 DBImpl* /*db*/,
+                                                 Version* /*version*/) {
+  // Current number of deletes in the immutable memtables
+  *value = cfd_->imm()->current()->GetTotalNumDeletes();
+  return true;
+}
+
+bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  // Estimate number of entries in the column family:
+  // Use estimated entries in tables + total entries in memtables.
+  const auto* vstorage = cfd_->current()->storage_info();
+  uint64_t estimate_keys = cfd_->mem()->num_entries() +
+                           cfd_->imm()->current()->GetTotalNumEntries() +
+                           vstorage->GetEstimatedActiveKeys();
+  uint64_t estimate_deletes =
+      cfd_->mem()->num_deletes() +
+      cfd_->imm()->current()->GetTotalNumDeletes();
+  *value = estimate_keys > estimate_deletes * 2
+               ? estimate_keys - (estimate_deletes * 2)
+               : 0;
+  return true;
+}
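A quick worked example of the estimate above, with assumed counts for illustration only: 100 memtable entries, 50 immutable-memtable entries, and 850 estimated active table keys give estimate_keys = 1000; 200 deletes across the memtables give estimate_deletes = 200. Each delete is assumed to shadow one existing key, and both the tombstone and the shadowed key are counted in estimate_keys, hence the factor of two: since 1000 > 2 * 200, the property reports 1000 - 400 = 600 keys.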
+bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db,
+                                       Version* /*version*/) {
+  *value = db->snapshots().count();
+  return true;
+}
+
+bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db,
+                                             Version* /*version*/) {
+  *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+  return true;
+}
+
+bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotSequence());
+  return true;
+}
+
+bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  *value = cfd_->GetNumLiveVersions();
+  return true;
+}
+
+bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value,
+                                                    DBImpl* /*db*/,
+                                                    Version* /*version*/) {
+  *value = cfd_->GetSuperVersionNumber();
+  return true;
+}
+
+bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  *value = db->IsFileDeletionsEnabled();
+  return true;
+}
+
+bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/,
+                                    Version* /*version*/) {
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = vstorage->base_level();
+  return true;
+}
+
+bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  *value = cfd_->GetTotalSstFilesSize();
+  return true;
+}
+
+bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  *value = cfd_->GetLiveSstFilesSize();
+  return true;
+}
+
+bool InternalStats::HandleEstimatePendingCompactionBytes(
+    uint64_t* value, DBImpl* /*db*/, Version* /*version*/) {
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = vstorage->estimated_compaction_needed_bytes();
+  return true;
+}
+
+bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value,
+                                                  DBImpl* /*db*/,
+                                                  Version* version) {
+  *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders();
+  return true;
+}
+
+bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/,
+                                               Version* version) {
+  const auto* vstorage = version->storage_info();
+  *value = vstorage->EstimateLiveDataSize();
+  return true;
+}
+
+bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
+                                             Version* /*version*/) {
+  *value = db->MinLogNumberToKeep();
+  return true;
+}
+
+bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value,
+                                                     DBImpl* db,
+                                                     Version* /*version*/) {
+  *value = db->MinObsoleteSstNumberToKeep();
+  return true;
+}
+
+bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  const WriteController& wc = db->write_controller();
+  if (!wc.NeedsDelay()) {
+    *value = 0;
+  } else {
+    *value = wc.delayed_write_rate();
+  }
+  return true;
+}
+
+bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
+                                         Version* /*version*/) {
+  *value = db->write_controller().IsStopped() ? 1 : 0;
+  return true;
+}
+
+bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value,
+                                                DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  // TODO(yiwu): The property is currently available for fifo compaction
+  // with allow_compaction = false. This is because we don't propagate
+  // oldest_key_time on compaction.
+  if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
+      cfd_->GetCurrentMutableCFOptions()
+          ->compaction_options_fifo.allow_compaction) {
+    return false;
+  }
+
+  TablePropertiesCollection collection;
+  auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
+  if (!s.ok()) {
+    return false;
+  }
+  *value = std::numeric_limits<uint64_t>::max();
+  for (auto& p : collection) {
+    *value = std::min(*value, p.second->oldest_key_time);
+    if (*value == 0) {
+      break;
+    }
+  }
+  if (*value > 0) {
+    *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
+                       cfd_->imm()->ApproximateOldestKeyTime(), *value});
+  }
+  return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
+}
+
+bool InternalStats::HandleBlockCacheStat(Cache** block_cache) {
+  assert(block_cache != nullptr);
+  auto* table_factory = cfd_->ioptions()->table_factory;
+  assert(table_factory != nullptr);
+  if (BlockBasedTableFactory::kName != table_factory->Name()) {
+    return false;
+  }
+  auto* table_options =
+      reinterpret_cast<BlockBasedTableOptions*>(table_factory->GetOptions());
+  if (table_options == nullptr) {
+    return false;
+  }
+  *block_cache = table_options->block_cache.get();
+  if (table_options->no_block_cache || *block_cache == nullptr) {
+    return false;
+  }
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+                                             Version* /*version*/) {
+  Cache* block_cache;
+  bool ok = HandleBlockCacheStat(&block_cache);
+  if (!ok) {
+    return false;
+  }
+  *value = static_cast<uint64_t>(block_cache->GetCapacity());
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  Cache* block_cache;
+  bool ok = HandleBlockCacheStat(&block_cache);
+  if (!ok) {
+    return false;
+  }
+  *value = static_cast<uint64_t>(block_cache->GetUsage());
+  return true;
+}
+
+bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  Cache* block_cache;
+  bool ok = HandleBlockCacheStat(&block_cache);
+  if (!ok) {
+    return false;
+  }
+  *value = static_cast<uint64_t>(block_cache->GetPinnedUsage());
+  return true;
+}
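The three block-cache handlers above surface through the public int-property API. A minimal sketch, assuming `db` is an open DB* whose column family uses a block-based table factory:

    uint64_t cache_usage = 0;
    // Returns false when the CF has no usable block cache
    // (see HandleBlockCacheStat above).
    db->GetIntProperty(DB::Properties::kBlockCacheUsage, &cache_usage);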
+void InternalStats::DumpDBStats(std::string* value) {
+  char buf[1000];
+  // DB-level stats, only available from default column family
+  double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec;
+  double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+  snprintf(buf, sizeof(buf),
+           "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+           seconds_up, interval_seconds_up);
+  value->append(buf);
+  // Cumulative
+  uint64_t user_bytes_written =
+      GetDBStats(InternalStats::kIntStatsBytesWritten);
+  uint64_t num_keys_written =
+      GetDBStats(InternalStats::kIntStatsNumKeysWritten);
+  uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
+  uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
+  uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
+  uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
+  uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
+  uint64_t write_stall_micros =
+      GetDBStats(InternalStats::kIntStatsWriteStallMicros);
+
+  const int kHumanMicrosLen = 32;
+  char human_micros[kHumanMicrosLen];
+
+  // Data
+  // writes: total number of write requests.
+  // keys: total number of key updates issued by all the write requests
+  // commit groups: number of group commits issued to the DB. Each group can
+  //                contain one or more writes.
+  // So keys/writes is the average number of puts per write request, and
+  // writes/groups is the average group commit size.
+  //
+  // The format is the same for interval stats.
+  snprintf(buf, sizeof(buf),
+           "Cumulative writes: %s writes, %s keys, %s commit groups, "
+           "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_other + write_self).c_str(),
+           NumberToHumanString(num_keys_written).c_str(),
+           NumberToHumanString(write_self).c_str(),
+           (write_other + write_self) / static_cast<double>(write_self + 1),
+           user_bytes_written / kGB, user_bytes_written / kMB / seconds_up);
+  value->append(buf);
+  // WAL
+  snprintf(buf, sizeof(buf),
+           "Cumulative WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_with_wal).c_str(),
+           NumberToHumanString(wal_synced).c_str(),
+           write_with_wal / static_cast<double>(wal_synced + 1),
+           wal_bytes / kGB, wal_bytes / kMB / seconds_up);
+  value->append(buf);
+  // Stall
+  AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+  value->append(buf);
+
+  // Interval
+  uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+  uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+  uint64_t interval_num_keys_written =
+      num_keys_written - db_stats_snapshot_.num_keys_written;
+  snprintf(
+      buf, sizeof(buf),
+      "Interval writes: %s writes, %s keys, %s commit groups, "
+      "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+      NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+      NumberToHumanString(interval_num_keys_written).c_str(),
+      NumberToHumanString(interval_write_self).c_str(),
+      static_cast<double>(interval_write_other + interval_write_self) /
+          (interval_write_self + 1),
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+          std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  uint64_t interval_write_with_wal =
+      write_with_wal - db_stats_snapshot_.write_with_wal;
+  uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+  uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+  snprintf(
+      buf, sizeof(buf),
+      "Interval WAL: %s writes, %s syncs, "
+      "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+      NumberToHumanString(interval_write_with_wal).c_str(),
+      NumberToHumanString(interval_wal_synced).c_str(),
+      interval_write_with_wal / static_cast<double>(interval_wal_synced + 1),
+      interval_wal_bytes / kGB,
+      interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  // Stall
+  AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+                    human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+               10000.0 / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  db_stats_snapshot_.seconds_up = seconds_up;
+  db_stats_snapshot_.ingest_bytes = user_bytes_written;
+  db_stats_snapshot_.write_other = write_other;
+  db_stats_snapshot_.write_self = write_self;
+  db_stats_snapshot_.num_keys_written = num_keys_written;
+  db_stats_snapshot_.wal_bytes = wal_bytes;
+  db_stats_snapshot_.wal_synced = wal_synced;
+  db_stats_snapshot_.write_with_wal = write_with_wal;
+  db_stats_snapshot_.write_stall_micros = write_stall_micros;
+}
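DumpDBStats follows the interval-bookkeeping pattern used throughout this file: report cumulative minus the snapshot taken by the previous dump, then roll the snapshot forward. A sketch of the idiom in isolation (member context assumed):

    uint64_t cumulative = GetDBStats(InternalStats::kIntStatsBytesWritten);
    uint64_t interval = cumulative - db_stats_snapshot_.ingest_bytes;
    db_stats_snapshot_.ingest_bytes = cumulative;  // next interval starts here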
+
+/**
+ * Dump compaction-level stats to a map: each key is a stat name prefixed
+ * with "compaction." and each value is a double rendered as a string. The
+ * level in the stat name is represented with a prefix "Lx" where "x" is the
+ * level number. A special level "Sum" represents the sum of a stat for all
+ * levels.
+ * The result also contains IO stall counters, whose keys start with
+ * "io_stalls." and whose values are uint64 counters encoded as strings.
+ */
+void InternalStats::DumpCFMapStats(
+    std::map<std::string, std::string>* cf_stats) {
+  CompactionStats compaction_stats_sum;
+  std::map<int, std::map<LevelStatType, double>> levels_stats;
+  DumpCFMapStats(&levels_stats, &compaction_stats_sum);
+  for (auto const& level_ent : levels_stats) {
+    auto level_str =
+        level_ent.first == -1 ? "Sum" : "L" + ToString(level_ent.first);
+    for (auto const& stat_ent : level_ent.second) {
+      auto stat_type = stat_ent.first;
+      auto key_str =
+          "compaction." + level_str + "." +
+          InternalStats::compaction_level_stats.at(stat_type).property_name;
+      (*cf_stats)[key_str] = std::to_string(stat_ent.second);
+    }
+  }
+
+  DumpCFMapStatsIOStalls(cf_stats);
+}
+
+void InternalStats::DumpCFMapStats(
+    std::map<int, std::map<LevelStatType, double>>* levels_stats,
+    CompactionStats* compaction_stats_sum) {
+  const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+
+  int num_levels_to_check =
+      (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+          ? vstorage->num_levels() - 1
+          : 1;
+
+  // Compaction scores are sorted by value. Restore them to level order.
+  std::vector<double> compaction_score(number_levels_, 0);
+  for (int i = 0; i < num_levels_to_check; ++i) {
+    compaction_score[vstorage->CompactionScoreLevel(i)] =
+        vstorage->CompactionScore(i);
+  }
+  // Count # of files being compacted for each level
+  std::vector<int> files_being_compacted(number_levels_, 0);
+  for (int level = 0; level < number_levels_; ++level) {
+    for (auto* f : vstorage->LevelFiles(level)) {
+      if (f->being_compacted) {
+        ++files_being_compacted[level];
+      }
+    }
+  }
+
+  int total_files = 0;
+  int total_files_being_compacted = 0;
+  double total_file_size = 0;
+  uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+  uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+  uint64_t curr_ingest = flush_ingest + add_file_ingest;
+  for (int level = 0; level < number_levels_; level++) {
+    int files = vstorage->NumLevelFiles(level);
+    total_files += files;
+    total_files_being_compacted += files_being_compacted[level];
+    if (comp_stats_[level].micros > 0 || files > 0) {
+      compaction_stats_sum->Add(comp_stats_[level]);
+      total_file_size += vstorage->NumLevelBytes(level);
+      uint64_t input_bytes;
+      if (level == 0) {
+        input_bytes = curr_ingest;
+      } else {
+        input_bytes = comp_stats_[level].bytes_read_non_output_levels;
+      }
+      double w_amp =
+          (input_bytes == 0)
+              ? 0.0
+              : static_cast<double>(comp_stats_[level].bytes_written) /
+                    input_bytes;
+      std::map<LevelStatType, double> level_stats;
+      PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+                        static_cast<double>(vstorage->NumLevelBytes(level)),
+                        compaction_score[level], w_amp, comp_stats_[level]);
+      (*levels_stats)[level] = level_stats;
+    }
+  }
+  // Cumulative summary
+  double w_amp = compaction_stats_sum->bytes_written /
+                 static_cast<double>(curr_ingest + 1);
+  // Stats summary across levels
+  std::map<LevelStatType, double> sum_stats;
+  PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted,
+                    total_file_size, 0, w_amp, *compaction_stats_sum);
+  (*levels_stats)[-1] = sum_stats;  // -1 is for the Sum level
+}
+
+void InternalStats::DumpCFMapStatsByPriority(
+    std::map<int, std::map<LevelStatType, double>>* priorities_stats) {
+  for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) {
+    if (comp_stats_by_pri_[priority].micros > 0) {
+      std::map<LevelStatType, double> priority_stats;
+      PrepareLevelStats(&priority_stats, 0 /* num_files */,
+                        0 /* being_compacted */, 0 /* total_file_size */,
+                        0 /* compaction_score */, 0 /* w_amp */,
+                        comp_stats_by_pri_[priority]);
+      (*priorities_stats)[static_cast<int>(priority)] = priority_stats;
+    }
+  }
+}
+
+void InternalStats::DumpCFMapStatsIOStalls(
+    std::map<std::string, std::string>* cf_stats) {
+  (*cf_stats)["io_stalls.level0_slowdown"] =
+      std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] =
+      std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.level0_numfiles"] =
+      std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] =
+      std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] =
+      std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] =
+      std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.memtable_compaction"] =
+      std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.memtable_slowdown"] =
+      std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]);
+
+  uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+                        cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+                        cf_stats_count_[MEMTABLE_LIMIT_STOPS];
+
+  uint64_t total_slowdown =
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+
+  (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop);
+  (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown);
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+  DumpCFStatsNoFileHistogram(value);
+  DumpCFFileHistogram(value);
+}
+
+void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) {
+  char buf[2000];
+  // Per-ColumnFamily stats
+  PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level");
+  value->append(buf);
+
+  // Print stats for each level
+  std::map<int, std::map<LevelStatType, double>> levels_stats;
+  CompactionStats compaction_stats_sum;
+  DumpCFMapStats(&levels_stats, &compaction_stats_sum);
+  for (int l = 0; l < number_levels_; ++l) {
+    if (levels_stats.find(l) != levels_stats.end()) {
+      PrintLevelStats(buf, sizeof(buf), "L" + ToString(l), levels_stats[l]);
+      value->append(buf);
+    }
+  }
+
+  // Print sum of level stats
+  PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]);
+  value->append(buf);
+
+  uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+  uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+  uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL];
+  uint64_t ingest_l0_files_addfile =
+      cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL];
+  uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL];
+  // Cumulative summary
+  uint64_t total_stall_count =
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+      cf_stats_count_[MEMTABLE_LIMIT_STOPS] +
+      cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+  // Interval summary
+  uint64_t interval_flush_ingest =
+      flush_ingest - cf_stats_snapshot_.ingest_bytes_flush;
+  uint64_t interval_add_file_ingest =
+      add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile;
+  uint64_t interval_ingest =
+      interval_flush_ingest + interval_add_file_ingest + 1;
+  CompactionStats interval_stats(compaction_stats_sum);
+  interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+  double w_amp =
+      interval_stats.bytes_written / static_cast<double>(interval_ingest);
+  PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats);
+  value->append(buf);
+
+  PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority");
+  value->append(buf);
+  std::map<int, std::map<LevelStatType, double>> priorities_stats;
+  DumpCFMapStatsByPriority(&priorities_stats);
+  for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) {
+    if (priorities_stats.find(static_cast<int>(priority)) !=
+        priorities_stats.end()) {
+      PrintLevelStats(
+          buf, sizeof(buf),
+          Env::PriorityToString(static_cast<Env::Priority>(priority)),
+          priorities_stats[static_cast<int>(priority)]);
+      value->append(buf);
+    }
+  }
+
+  double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec;
+  double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up;
+  snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+           seconds_up, interval_seconds_up);
+  value->append(buf);
+  snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n",
+           flush_ingest / kGB, interval_flush_ingest / kGB);
+  value->append(buf);
+  snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n",
+           add_file_ingest / kGB, interval_add_file_ingest / kGB);
+  value->append(buf);
+
+  uint64_t interval_ingest_files_addfile =
+      ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64
+           "\n",
+           ingest_files_addfile, interval_ingest_files_addfile);
+  value->append(buf);
+
+  uint64_t interval_ingest_l0_files_addfile =
+      ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+           ingest_l0_files_addfile, interval_ingest_l0_files_addfile);
+  value->append(buf);
+
+  uint64_t interval_ingest_keys_addfile =
+      ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+           ingest_keys_addfile, interval_ingest_keys_addfile);
+  value->append(buf);
+
+  // Compact
+  uint64_t compact_bytes_read = 0;
+  uint64_t compact_bytes_write = 0;
+  uint64_t compact_micros = 0;
+  for (int level = 0; level < number_levels_; level++) {
+    compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+                          comp_stats_[level].bytes_read_non_output_levels;
+    compact_bytes_write += comp_stats_[level].bytes_written;
+    compact_micros += comp_stats_[level].micros;
+  }
+
+  snprintf(buf, sizeof(buf),
+           "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+           "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+           compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up,
+           compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up,
+           compact_micros / kMicrosInSec);
+  value->append(buf);
+
+  // Compaction interval
+  uint64_t interval_compact_bytes_write =
+      compact_bytes_write - cf_stats_snapshot_.compact_bytes_write;
+  uint64_t interval_compact_bytes_read =
+      compact_bytes_read - cf_stats_snapshot_.compact_bytes_read;
+  uint64_t interval_compact_micros =
+      compact_micros - cf_stats_snapshot_.compact_micros;
+
+  snprintf(
+      buf, sizeof(buf),
+      "Interval compaction: %.2f GB write, %.2f MB/s write, "
+      "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+      interval_compact_bytes_write / kGB,
+      interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_bytes_read / kGB,
+      interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_micros / kMicrosInSec);
+  value->append(buf);
+  cf_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+  cf_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+  cf_stats_snapshot_.compact_micros = compact_micros;
+
+  snprintf(buf, sizeof(buf),
+           "Stalls(count): %" PRIu64
+           " level0_slowdown, "
+           "%" PRIu64
+           " level0_slowdown_with_compaction, "
+           "%" PRIu64
+           " level0_numfiles, "
+           "%" PRIu64
+           " level0_numfiles_with_compaction, "
+           "%" PRIu64
+           " stop for pending_compaction_bytes, "
+           "%" PRIu64
+           " slowdown for pending_compaction_bytes, "
+           "%" PRIu64
+           " memtable_compaction, "
+           "%" PRIu64
+           " memtable_slowdown, "
+           "interval %" PRIu64 " total count\n",
+           cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+           cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+           cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS],
+           cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS],
+           cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS],
+           cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS],
+           cf_stats_count_[MEMTABLE_LIMIT_STOPS],
+           cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS],
+           total_stall_count - cf_stats_snapshot_.stall_count);
+  value->append(buf);
+
+  cf_stats_snapshot_.seconds_up = seconds_up;
+  cf_stats_snapshot_.ingest_bytes_flush = flush_ingest;
+  cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest;
+  cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile;
+  cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile;
+  cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile;
+  cf_stats_snapshot_.comp_stats = compaction_stats_sum;
+  cf_stats_snapshot_.stall_count = total_stall_count;
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+  char buf[2000];
+  snprintf(buf, sizeof(buf),
+           "\n** File Read Latency Histogram By Level [%s] **\n",
+           cfd_->GetName().c_str());
+  value->append(buf);
+
+  for (int level = 0; level < number_levels_; level++) {
+    if (!file_read_latency_[level].Empty()) {
+      char buf2[5000];
+      snprintf(buf2, sizeof(buf2),
+               "** Level %d read latency histogram (micros):\n%s\n", level,
+               file_read_latency_[level].ToString().c_str());
+      value->append(buf2);
+    }
+  }
+}
+
+#else
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) {
+  return nullptr;
+}
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
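Before the header below: the kCFStats text dump above is also exposed in structured form through the map-property path. A sketch of the map-valued retrieval, assuming `db` is an open DB*:

    std::map<std::string, std::string> cf_stats;
    // Keys follow DumpCFMapStats: e.g. "compaction.L0.WriteAmp",
    // "compaction.Sum.SizeBytes", "io_stalls.total_slowdown".
    db->GetMapProperty(DB::Properties::kCFStats, &cf_stats);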
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
new file mode 100644
index 000000000..ce83be244
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.h
@@ -0,0 +1,697 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/version_set.h"
+
+class ColumnFamilyData;
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+  bool need_out_of_mutex;
+
+  // gcc had an internal error for initializing union of pointer-to-member-
+  // functions. Workaround is to populate exactly one of the following function
+  // pointers with a non-nullptr value.
+
+  // @param value Value-result argument for storing the property's string value
+  // @param suffix Argument portion of the property. For example, suffix would
+  //   be "5" for the property "rocksdb.num-files-at-level5". So far, only
+  //   certain string properties take an argument.
+  bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+  // @param value Value-result argument for storing the property's uint64 value
+  // @param db Many of the int properties rely on DBImpl methods.
+  // @param version Version is needed in case the property is retrieved without
+  //   holding db mutex, which is only supported for int properties.
+  bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+                                    Version* version);
+
+  // @param props Map of general properties to populate
+  bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props);
+
+  // handle the string type properties that rely on DBImpl methods
+  // @param value Value-result argument for storing the property's string value
+  bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#ifndef ROCKSDB_LITE
+#undef SCORE
+enum class LevelStatType {
+  INVALID = 0,
+  NUM_FILES,
+  COMPACTED_FILES,
+  SIZE_BYTES,
+  SCORE,
+  READ_GB,
+  RN_GB,
+  RNP1_GB,
+  WRITE_GB,
+  W_NEW_GB,
+  MOVED_GB,
+  WRITE_AMP,
+  READ_MBPS,
+  WRITE_MBPS,
+  COMP_SEC,
+  COMP_CPU_SEC,
+  COMP_COUNT,
+  AVG_SEC,
+  KEY_IN,
+  KEY_DROP,
+  TOTAL  // total number of types
+};
+
+struct LevelStat {
+  // This is what will be L?.property_name in the flat map returned to the user
+  std::string property_name;
+  // This will be what we will print in the header in the cli
+  std::string header_name;
+};
+
+class InternalStats {
+ public:
+  static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+  enum InternalCFStatsType {
+    L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    MEMTABLE_LIMIT_STOPS,
+    MEMTABLE_LIMIT_SLOWDOWNS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+    WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    BYTES_INGESTED_ADD_FILE,
+    INGESTED_NUM_FILES_TOTAL,
+    INGESTED_LEVEL0_NUM_FILES_TOTAL,
+    INGESTED_NUM_KEYS_TOTAL,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    kIntStatsWalFileBytes,
+    kIntStatsWalFileSynced,
+    kIntStatsBytesWritten,
+    kIntStatsNumKeysWritten,
+    kIntStatsWriteDoneByOther,
+    kIntStatsWriteDoneBySelf,
+    kIntStatsWriteWithWal,
+    kIntStatsWriteStallMicros,
+    kIntStatsNumMax,
+  };
+
+  InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd)
+      : db_stats_{},
+        cf_stats_value_{},
+        cf_stats_count_{},
+        comp_stats_(num_levels),
+        comp_stats_by_pri_(Env::Priority::TOTAL),
+        file_read_latency_(num_levels),
+        bg_error_count_(0),
+        number_levels_(num_levels),
+        env_(env),
+        cfd_(cfd),
+        started_at_(env->NowMicros()) {}
+
+  // Per level compaction stats. comp_stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t cpu_micros;
+
+    // The number of bytes read from all non-output levels
+    uint64_t bytes_read_non_output_levels;
+
+    // The number of bytes read from the compaction output level.
+    uint64_t bytes_read_output_level;
+
+    // Total number of bytes written during compaction
+    uint64_t bytes_written;
+
+    // Total number of bytes moved to the output level
+    uint64_t bytes_moved;
+
+    // The number of compaction input files in all non-output levels.
+    int num_input_files_in_non_output_levels;
+
+    // The number of compaction input files in the output level.
+    int num_input_files_in_output_level;
+
+    // The number of compaction output files.
+    int num_output_files;
+
+    // Total incoming entries during compaction between levels N and N+1
+    uint64_t num_input_records;
+
+    // Accumulated diff number of entries
+    // (num input entries - num output entries) for compaction levels N and N+1
+    uint64_t num_dropped_records;
+
+    // Number of compactions done
+    int count;
+
+    // Number of compactions done per CompactionReason
+    int counts[static_cast<int>(CompactionReason::kNumOfReasons)];
+
+    explicit CompactionStats()
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_written(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          count(0) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+    }
+
+    explicit CompactionStats(CompactionReason reason, int c)
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_written(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          count(c) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      if (r >= 0 && r < num_of_reasons) {
+        counts[r] = c;
+      } else {
+        count = 0;
+      }
+    }
+
+    explicit CompactionStats(const CompactionStats& c)
+        : micros(c.micros),
+          cpu_micros(c.cpu_micros),
+          bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+          bytes_read_output_level(c.bytes_read_output_level),
+          bytes_written(c.bytes_written),
+          bytes_moved(c.bytes_moved),
+          num_input_files_in_non_output_levels(
+              c.num_input_files_in_non_output_levels),
+          num_input_files_in_output_level(c.num_input_files_in_output_level),
+          num_output_files(c.num_output_files),
+          num_input_records(c.num_input_records),
+          num_dropped_records(c.num_dropped_records),
+          count(c.count) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+    }
+
+    CompactionStats& operator=(const CompactionStats& c) {
+      micros = c.micros;
+      cpu_micros = c.cpu_micros;
+      bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+      bytes_read_output_level = c.bytes_read_output_level;
+      bytes_written = c.bytes_written;
+      bytes_moved = c.bytes_moved;
+      num_input_files_in_non_output_levels =
+          c.num_input_files_in_non_output_levels;
+      num_input_files_in_output_level = c.num_input_files_in_output_level;
+      num_output_files = c.num_output_files;
+      num_input_records = c.num_input_records;
+      num_dropped_records = c.num_dropped_records;
+      count = c.count;
+
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+      return *this;
+    }
+        counts[i] = 0;
+      }
+    }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->cpu_micros += c.cpu_micros;
+      this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+      this->bytes_read_output_level += c.bytes_read_output_level;
+      this->bytes_written += c.bytes_written;
+      this->bytes_moved += c.bytes_moved;
+      this->num_input_files_in_non_output_levels +=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level +=
+          c.num_input_files_in_output_level;
+      this->num_output_files += c.num_output_files;
+      this->num_input_records += c.num_input_records;
+      this->num_dropped_records += c.num_dropped_records;
+      this->count += c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += c.counts[i];
+      }
+    }
+
+    void Subtract(const CompactionStats& c) {
+      this->micros -= c.micros;
+      this->cpu_micros -= c.cpu_micros;
+      this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+      this->bytes_read_output_level -= c.bytes_read_output_level;
+      this->bytes_written -= c.bytes_written;
+      this->bytes_moved -= c.bytes_moved;
+      this->num_input_files_in_non_output_levels -=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level -=
+          c.num_input_files_in_output_level;
+      this->num_output_files -= c.num_output_files;
+      this->num_input_records -= c.num_input_records;
+      this->num_dropped_records -= c.num_dropped_records;
+      this->count -= c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] -= c.counts[i];
+      }
+    }
+  };
+
+  void Clear() {
+    for (int i = 0; i < kIntStatsNumMax; i++) {
+      db_stats_[i].store(0);
+    }
+    for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+      cf_stats_count_[i] = 0;
+      cf_stats_value_[i] = 0;
+    }
+    for (auto& comp_stat : comp_stats_) {
+      comp_stat.Clear();
+    }
+    for (auto& h : file_read_latency_) {
+      h.Clear();
+    }
+    cf_stats_snapshot_.Clear();
+    db_stats_snapshot_.Clear();
+    bg_error_count_ = 0;
+    started_at_ = env_->NowMicros();
+  }
+
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStats& stats) {
+    comp_stats_[level].Add(stats);
+    comp_stats_by_pri_[thread_pri].Add(stats);
+  }
+
+  void IncBytesMoved(int level, uint64_t amount) {
+    comp_stats_[level].bytes_moved += amount;
+  }
+
+  void AddCFStats(InternalCFStatsType type, uint64_t value) {
+    cf_stats_value_[type] += value;
+    ++cf_stats_count_[type];
+  }
+
+  void AddDBStats(InternalDBStatsType type, uint64_t value,
+                  bool concurrent = false) {
+    auto& v = db_stats_[type];
+    if (concurrent) {
+      v.fetch_add(value, std::memory_order_relaxed);
+    } else {
+      v.store(v.load(std::memory_order_relaxed) + value,
+              std::memory_order_relaxed);
+    }
+  }
+
+  uint64_t GetDBStats(InternalDBStatsType type) {
+    return db_stats_[type].load(std::memory_order_relaxed);
+  }
+
+  HistogramImpl* GetFileReadHist(int level) {
+    return &file_read_latency_[level];
+  }
+
+  uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+  bool GetStringProperty(const DBPropertyInfo& property_info,
+                         const Slice& property, std::string* value);
+
+  bool GetMapProperty(const DBPropertyInfo& property_info,
+                      const Slice& property,
+                      std::map<std::string, std::string>* value);
+
+  bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+                      DBImpl* db);
+
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+                                Version* version, uint64_t* value);
+
+  const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+    return comp_stats_;
+  }
+
+  // Store a mapping from the user-facing DB::Properties string to our
+  // DBPropertyInfo struct used internally for retrieving properties.
+  static const std::unordered_map<std::string, DBPropertyInfo>
+      ppt_name_to_info;
+
+ private:
+  void DumpDBStats(std::string* value);
+  void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+  void DumpCFMapStats(
+      std::map<int, std::map<LevelStatType, double>>* level_stats,
+      CompactionStats* compaction_stats_sum);
+  void DumpCFMapStatsByPriority(
+      std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+  void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
+  void DumpCFStats(std::string* value);
+  void DumpCFStatsNoFileHistogram(std::string* value);
+  void DumpCFFileHistogram(std::string* value);
+
+  bool HandleBlockCacheStat(Cache** block_cache);
+
+  // Per-DB stats
+  std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+  // Per-ColumnFamily stats
+  uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+  uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+  // Per-ColumnFamily/level compaction stats
+  std::vector<CompactionStats> comp_stats_;
+  std::vector<CompactionStats> comp_stats_by_pri_;
+  std::vector<HistogramImpl> file_read_latency_;
+
+  // Used to compute per-interval statistics
+  struct CFStatsSnapshot {
+    // ColumnFamily-level stats
+    CompactionStats comp_stats;
+    uint64_t ingest_bytes_flush;  // Bytes written to L0 (Flush)
+    uint64_t stall_count;         // Stall count
+    // Stats from compaction jobs - bytes written, bytes read, duration.
+    uint64_t compact_bytes_write;
+    uint64_t compact_bytes_read;
+    uint64_t compact_micros;
+    double seconds_up;
+
+    // AddFile specific stats
+    uint64_t ingest_bytes_addfile;     // Total Bytes ingested
+    uint64_t ingest_files_addfile;     // Total number of files ingested
+    uint64_t ingest_l0_files_addfile;  // Total number of files ingested to L0
+    uint64_t ingest_keys_addfile;      // Total number of keys ingested
+
+    CFStatsSnapshot()
+        : ingest_bytes_flush(0),
+          stall_count(0),
+          compact_bytes_write(0),
+          compact_bytes_read(0),
+          compact_micros(0),
+          seconds_up(0),
+          ingest_bytes_addfile(0),
+          ingest_files_addfile(0),
+          ingest_l0_files_addfile(0),
+          ingest_keys_addfile(0) {}
+
+    void Clear() {
+      comp_stats.Clear();
+      ingest_bytes_flush = 0;
+      stall_count = 0;
+      compact_bytes_write = 0;
+      compact_bytes_read = 0;
+      compact_micros = 0;
+      seconds_up = 0;
+      ingest_bytes_addfile = 0;
+      ingest_files_addfile = 0;
+      ingest_l0_files_addfile = 0;
+      ingest_keys_addfile = 0;
+    }
+  } cf_stats_snapshot_;
+
+  struct DBStatsSnapshot {
+    // DB-level stats
+    uint64_t ingest_bytes;    // Bytes written by user
+    uint64_t wal_bytes;       // Bytes written to WAL
+    uint64_t wal_synced;      // Number of times WAL is synced
+    uint64_t write_with_wal;  // Number of writes that request WAL
+    // These count the number of writes processed by the calling thread or
+    // another thread.
+    uint64_t write_other;
+    uint64_t write_self;
+    // Total number of keys written. write_self and write_other measure the
+    // number of write requests written; each write request can contain
+    // updates to multiple keys. num_keys_written is the total number of keys
+    // updated by all those writes.
+    uint64_t num_keys_written;
+    // Total time writes delayed by stalls.
+    uint64_t write_stall_micros;
+    double seconds_up;
+
+    DBStatsSnapshot()
+        : ingest_bytes(0),
+          wal_bytes(0),
+          wal_synced(0),
+          write_with_wal(0),
+          write_other(0),
+          write_self(0),
+          num_keys_written(0),
+          write_stall_micros(0),
+          seconds_up(0) {}
+
+    void Clear() {
+      ingest_bytes = 0;
+      wal_bytes = 0;
+      wal_synced = 0;
+      write_with_wal = 0;
+      write_other = 0;
+      write_self = 0;
+      num_keys_written = 0;
+      write_stall_micros = 0;
+      seconds_up = 0;
+    }
+  } db_stats_snapshot_;
+
+  // Handler functions for getting property values. They use "value" as a
+  // value-result argument, and return true upon successfully setting "value".
+  bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+  bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+  bool HandleLevelStats(std::string* value, Slice suffix);
+  bool HandleStats(std::string* value, Slice suffix);
+  bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats);
+  bool HandleCFStats(std::string* value, Slice suffix);
+  bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+  bool HandleCFFileHistogram(std::string* value, Slice suffix);
+  bool HandleDBStats(std::string* value, Slice suffix);
+  bool HandleSsTables(std::string* value, Slice suffix);
+  bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+  bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
+  bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+                                         Version* version);
+  bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+                                       Version* version);
+  bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+                                            Version* version);
+  bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+                                     Version* version);
+
bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db, + Version* version); + bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version); + bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db, + Version* version); + bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db, + Version* version); + bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version); + bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db, + Version* version); + bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db, + Version* version); + // Total number of background errors encountered. Every time a flush task + // or compaction task fails, this counter is incremented. The failure can + // be caused by any possible reason, including file system errors, out of + // resources, or input file corruption. Failing when retrying the same flush + // or compaction will cause the counter to increase too. + uint64_t bg_error_count_; + + const int number_levels_; + Env* env_; + ColumnFamilyData* cfd_; + uint64_t started_at_; +}; + +#else + +class InternalStats { + public: + enum InternalCFStatsType { + L0_FILE_COUNT_LIMIT_SLOWDOWNS, + LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, + MEMTABLE_LIMIT_STOPS, + MEMTABLE_LIMIT_SLOWDOWNS, + L0_FILE_COUNT_LIMIT_STOPS, + LOCKED_L0_FILE_COUNT_LIMIT_STOPS, + PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, + PENDING_COMPACTION_BYTES_LIMIT_STOPS, + WRITE_STALLS_ENUM_MAX, + BYTES_FLUSHED, + BYTES_INGESTED_ADD_FILE, + INGESTED_NUM_FILES_TOTAL, + INGESTED_LEVEL0_NUM_FILES_TOTAL, + INGESTED_NUM_KEYS_TOTAL, + INTERNAL_CF_STATS_ENUM_MAX, + }; + + enum InternalDBStatsType { + kIntStatsWalFileBytes, + kIntStatsWalFileSynced, + kIntStatsBytesWritten, + kIntStatsNumKeysWritten, + kIntStatsWriteDoneByOther, + kIntStatsWriteDoneBySelf, + kIntStatsWriteWithWal, + kIntStatsWriteStallMicros, + kIntStatsNumMax, + }; + + InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {} + + struct CompactionStats { + uint64_t micros; + uint64_t cpu_micros; + uint64_t bytes_read_non_output_levels; + uint64_t bytes_read_output_level; + uint64_t bytes_written; + uint64_t bytes_moved; + int num_input_files_in_non_output_levels; + int num_input_files_in_output_level; + int num_output_files; + uint64_t num_input_records; + uint64_t num_dropped_records; + int count; + + explicit CompactionStats() {} + + explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {} + + explicit CompactionStats(const CompactionStats& /*c*/) {} + + void Add(const CompactionStats& /*c*/) {} + + void Subtract(const CompactionStats& /*c*/) {} + }; + + void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/, + const CompactionStats& /*stats*/) {} + + void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {} + + void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {} + + void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/, + bool /*concurrent */ = false) {} + + HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; } + + uint64_t GetBackgroundErrorCount() const { return 0; } + + uint64_t BumpAndGetBackgroundErrorCount() { return 0; } + + bool GetStringProperty(const DBPropertyInfo& /*property_info*/, + const Slice& /*property*/, std::string* /*value*/) { + return false; + } + + bool GetMapProperty(const DBPropertyInfo& /*property_info*/, + const Slice& /*property*/, + 
                      std::map<std::string, std::string>* /*value*/) {
+    return false;
+  }
+
+  bool GetIntProperty(const DBPropertyInfo& /*property_info*/,
+                      uint64_t* /*value*/, DBImpl* /*db*/) const {
+    return false;
+  }
+
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+                                Version* /*version*/,
+                                uint64_t* /*value*/) const {
+    return false;
+  }
+};
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 000000000..31ff26c3a
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/log_writer.h"
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+  struct WriteStallNotification {
+    WriteStallInfo write_stall_info;
+    const ImmutableCFOptions* immutable_cf_options;
+  };
+
+  autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+  autovector<WriteStallNotification> write_stall_notifications;
+#endif
+  std::unique_ptr<SuperVersion>
+      new_superversion;  // if nullptr no new superversion
+
+  explicit SuperVersionContext(bool create_superversion = false)
+      : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+  explicit SuperVersionContext(SuperVersionContext&& other)
+      : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+        write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+        new_superversion(std::move(other.new_superversion)) {
+  }
+
+  void NewSuperVersion() {
+    new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+  }
+
+  inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    return !superversions_to_free.empty() ||
+           !write_stall_notifications.empty();
+#else
+    return !superversions_to_free.empty();
+#endif
+  }
+
+  void PushWriteStallNotification(
+      WriteStallCondition old_cond, WriteStallCondition new_cond,
+      const std::string& name, const ImmutableCFOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    WriteStallNotification notif;
+    notif.write_stall_info.cf_name = name;
+    notif.write_stall_info.condition.prev = old_cond;
+    notif.write_stall_info.condition.cur = new_cond;
+    notif.immutable_cf_options = ioptions;
+    write_stall_notifications.push_back(notif);
+#else
+    (void)old_cond;
+    (void)new_cond;
+    (void)name;
+    (void)ioptions;
+#endif  // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+  }
+
+  void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    // notify listeners on changed write stall conditions
+    for (auto& notif : write_stall_notifications) {
+      for (auto& listener : notif.immutable_cf_options->listeners) {
+        listener->OnStallConditionsChanged(notif.write_stall_info);
+      }
+    }
+    write_stall_notifications.clear();
+#endif  // !ROCKSDB_LITE
+    // free superversions
+    for (auto s : superversions_to_free) {
+      delete s;
+    }
+    superversions_to_free.clear();
+  }
+
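+  // Illustrative lifecycle sketch (hypothetical caller code, not part of
+  // this header): the destructor below only asserts that the context was
+  // drained, so a caller typically does something like:
+  //
+  //   SuperVersionContext sv_context(/*create_superversion=*/true);
+  //   ... install the new SuperVersion while holding the DB mutex ...
+  //   mutex->Unlock();
+  //   sv_context.Clean();  // notifies listeners, frees old SuperVersions
+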
+  ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    assert(write_stall_notifications.empty());
+#endif
+    assert(superversions_to_free.empty());
+  }
+};
+
+struct JobContext {
+  inline bool HaveSomethingToDelete() const {
+    return full_scan_candidate_files.size() || sst_delete_files.size() ||
+           log_delete_files.size() || manifest_delete_files.size();
+  }
+
+  inline bool HaveSomethingToClean() const {
+    bool sv_have_sth = false;
+    for (const auto& sv_ctx : superversion_contexts) {
+      if (sv_ctx.HaveSomethingToDelete()) {
+        sv_have_sth = true;
+        break;
+      }
+    }
+    return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+           sv_have_sth;
+  }
+
+  // Structure to store information for candidate files to delete.
+  struct CandidateFileInfo {
+    std::string file_name;
+    std::string file_path;
+    CandidateFileInfo(std::string name, std::string path)
+        : file_name(std::move(name)), file_path(std::move(path)) {}
+    bool operator==(const CandidateFileInfo& other) const {
+      return file_name == other.file_name &&
+             file_path == other.file_path;
+    }
+  };
+
+  // Unique job id
+  int job_id;
+
+  // a list of all files that we'll consider deleting
+  // (every once in a while this is filled up with all files
+  // in the DB directory)
+  // (filled only if we're doing full scan)
+  std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+  // the list of all live sst files that cannot be deleted
+  std::vector<FileDescriptor> sst_live;
+
+  // a list of sst files that we need to delete
+  std::vector<ObsoleteFileInfo> sst_delete_files;
+
+  // a list of log files that we need to delete
+  std::vector<uint64_t> log_delete_files;
+
+  // a list of log files that we need to preserve during full purge since they
+  // will be reused later
+  std::vector<uint64_t> log_recycle_files;
+
+  // a list of manifest files that we need to delete
+  std::vector<std::string> manifest_delete_files;
+
+  // a list of memtables to be freed
+  autovector<MemTable*> memtables_to_free;
+
+  // contexts for installing superversions for multiple column families
+  std::vector<SuperVersionContext> superversion_contexts;
+
+  autovector<log::Writer*> logs_to_free;
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that corresponds to the set of files in 'live'.
+  uint64_t manifest_file_number;
+  uint64_t pending_manifest_file_number;
+  uint64_t log_number;
+  uint64_t prev_log_number;
+
+  uint64_t min_pending_output = 0;
+  uint64_t prev_total_log_size = 0;
+  size_t num_alive_log_files = 0;
+  uint64_t size_log_to_delete = 0;
+
+  // Snapshot taken before flush/compaction job.
+  std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+  explicit JobContext(int _job_id, bool create_superversion = false) {
+    job_id = _job_id;
+    manifest_file_number = 0;
+    pending_manifest_file_number = 0;
+    log_number = 0;
+    prev_log_number = 0;
+    superversion_contexts.emplace_back(
+        SuperVersionContext(create_superversion));
+  }
+
+  // For non-empty JobContext Clean() has to be called at least once before
+  // destruction (see asserts in ~JobContext()). Should be called with
+  // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally
+  // doing potentially slow Clean() with locked DB mutex.
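+  //
+  // Illustrative call pattern (the surrounding helper names are assumptions
+  // for this sketch, not declared in this header):
+  //
+  //   JobContext job_context(next_job_id);
+  //   mutex_.Lock();
+  //   FindObsoleteFiles(&job_context, /*force=*/false);
+  //   mutex_.Unlock();
+  //   if (job_context.HaveSomethingToDelete()) {
+  //     PurgeObsoleteFiles(job_context);
+  //   }
+  //   job_context.Clean();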
+ void Clean() { + // free superversions + for (auto& sv_context : superversion_contexts) { + sv_context.Clean(); + } + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + for (auto l : logs_to_free) { + delete l; + } + + memtables_to_free.clear(); + logs_to_free.clear(); + job_snapshot.reset(); + } + + ~JobContext() { + assert(memtables_to_free.size() == 0); + assert(logs_to_free.size() == 0); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc new file mode 100644 index 000000000..eb1a08a35 --- /dev/null +++ b/src/rocksdb/db/listener_test.cc @@ -0,0 +1,1042 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob_index.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "memtable/hash_linklist_rep.h" +#include "monitoring/statistics.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +#ifndef ROCKSDB_LITE + +namespace ROCKSDB_NAMESPACE { + +class EventListenerTest : public DBTestBase { + public: + EventListenerTest() : DBTestBase("/listener_test") {} + + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + + const size_t k110KB = 110 << 10; +}; + +struct TestPropertiesCollector + : public ROCKSDB_NAMESPACE::TablePropertiesCollector { + ROCKSDB_NAMESPACE::Status AddUserKey( + const ROCKSDB_NAMESPACE::Slice& /*key*/, + const ROCKSDB_NAMESPACE::Slice& /*value*/, + ROCKSDB_NAMESPACE::EntryType /*type*/, + ROCKSDB_NAMESPACE::SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + ROCKSDB_NAMESPACE::Status Finish( + ROCKSDB_NAMESPACE::UserCollectedProperties* properties) override { + properties->insert({"0", "1"}); + return Status::OK(); + } + + const char* Name() const override { return "TestTablePropertiesCollector"; } + + ROCKSDB_NAMESPACE::UserCollectedProperties GetReadableProperties() + const override { + ROCKSDB_NAMESPACE::UserCollectedProperties ret; + ret["2"] = "3"; + return ret; + } +}; + +class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new TestPropertiesCollector; + } + const char* Name() const override { return "TestTablePropertiesCollector"; } +}; 
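+
+// A minimal sketch (illustrative only, not used by the tests below) of how
+// the collected property can be read back through the public API; `db` here
+// stands for an open DB* whose Options carry the factory above:
+//
+//   TablePropertiesCollection props;
+//   ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+//   for (const auto& item : props) {
+//     ASSERT_EQ(item.second->user_collected_properties.at("0"), "1");
+//   }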
+
+class TestCompactionListener : public EventListener {
+ public:
+  explicit TestCompactionListener(EventListenerTest* test) : test_(test) {}
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    compacted_dbs_.push_back(db);
+    ASSERT_GT(ci.input_files.size(), 0U);
+    ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size());
+
+    for (size_t i = 0; i < ci.input_file_infos.size(); ++i) {
+      ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level);
+      ASSERT_EQ(ci.input_file_infos[i].file_number,
+                TableFileNameToNumber(ci.input_files[i]));
+    }
+
+    ASSERT_GT(ci.output_files.size(), 0U);
+    ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
+
+    ASSERT_TRUE(test_);
+    ASSERT_EQ(test_->db_, db);
+
+    std::vector<std::vector<FileMetaData>> files_by_level;
+    test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
+                                           &files_by_level);
+    ASSERT_GT(files_by_level.size(), ci.output_level);
+
+    for (size_t i = 0; i < ci.output_file_infos.size(); ++i) {
+      ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level);
+      ASSERT_EQ(ci.output_file_infos[i].file_number,
+                TableFileNameToNumber(ci.output_files[i]));
+
+      auto it = std::find_if(
+          files_by_level[ci.output_level].begin(),
+          files_by_level[ci.output_level].end(),
+          [&](const FileMetaData& meta) {
+            return meta.fd.GetNumber() == ci.output_file_infos[i].file_number;
+          });
+      ASSERT_NE(it, files_by_level[ci.output_level].end());
+
+      ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number,
+                it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+    ASSERT_GT(ci.thread_id, 0U);
+
+    for (auto fl : {ci.input_files, ci.output_files}) {
+      for (auto fn : fl) {
+        auto it = ci.table_properties.find(fn);
+        ASSERT_NE(it, ci.table_properties.end());
+        auto tp = it->second;
+        ASSERT_TRUE(tp != nullptr);
+        ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+      }
+    }
+  }
+
+  EventListenerTest* test_;
+  std::vector<DB*> compacted_dbs_;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  const int kNumL0Files = 4;
+
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+
+  TestCompactionListener* listener = new TestCompactionListener(this);
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+  CreateAndReopenWithCF(cf_names, options);
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+                                             BlobStr(123, 0, 1 << 10)));
+  ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (int i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+                                     nullptr, nullptr));
+    dbfull()->TEST_WaitForCompact();
+  }
+
+  ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->compacted_dbs_[i], db_);
+  }
+}
+
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+  TestFlushListener(Env* env, EventListenerTest* test)
+      : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+    db_closed = false;
+  }
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    // remember the info for later checking the FlushJobInfo.
+    prev_fc_info_ = info;
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_GT(info.table_properties.data_size, 0U);
+    ASSERT_GT(info.table_properties.raw_key_size, 0U);
+    ASSERT_GT(info.table_properties.raw_value_size, 0U);
+    ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+    ASSERT_GT(info.table_properties.num_entries, 0U);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+    // Verify the id of the current thread that created this table
+    // file matches the id of any active flush or compaction thread.
+    uint64_t thread_id = env_->GetThreadID();
+    std::vector<ThreadStatus> thread_list;
+    ASSERT_OK(env_->GetThreadList(&thread_list));
+    bool found_match = false;
+    for (auto thread_status : thread_list) {
+      if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+          thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+        if (thread_id == thread_status.thread_id) {
+          found_match = true;
+          break;
+        }
+      }
+    }
+    ASSERT_TRUE(found_match);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  }
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    flushed_dbs_.push_back(db);
+    flushed_column_family_names_.push_back(info.cf_name);
+    if (info.triggered_writes_slowdown) {
+      slowdown_count++;
+    }
+    if (info.triggered_writes_stop) {
+      stop_count++;
+    }
+    // verify whether the previously created file matches the flushed file.
+    ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+    ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+    ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+    ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+    ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+    // Note: the following chunk relies on the notification pertaining to the
+    // database pointed to by DBTestBase::db_, and is thus bypassed when
+    // that assumption does not hold (see the test case MultiDBMultiListeners
+    // below).
+    ASSERT_TRUE(test_);
+    if (db == test_->db_) {
+      std::vector<std::vector<FileMetaData>> files_by_level;
+      test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id],
+                                             &files_by_level);
+
+      ASSERT_FALSE(files_by_level.empty());
+      auto it = std::find_if(files_by_level[0].begin(),
+                             files_by_level[0].end(),
+                             [&](const FileMetaData& meta) {
+                               return meta.fd.GetNumber() == info.file_number;
+                             });
+      ASSERT_NE(it, files_by_level[0].end());
+      ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+    ASSERT_GT(info.thread_id, 0U);
+    ASSERT_EQ(
+        info.table_properties.user_collected_properties.find("0")->second,
+        "1");
+  }
+
+  std::vector<std::string> flushed_column_family_names_;
+  std::vector<DB*> flushed_dbs_;
+  int slowdown_count;
+  int stop_count;
+  bool db_closing;
+  std::atomic_bool db_closed;
+  TableFileCreationInfo prev_fc_info_;
+
+ protected:
+  Env* env_;
+  EventListenerTest* test_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+  CreateAndReopenWithCF(cf_names, options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+                                             BlobStr(456, 0, 1 << 10)));
+  ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (int i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure callback functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+  std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+  CreateAndReopenWithCF(cf_names, options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (int i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure callback functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); i++) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+  Options options;
+  options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+  std::vector<TestFlushListener*> listeners;
+  const int kNumDBs = 5;
+  const int kNumListeners = 10;
+  for (int i = 0; i < kNumListeners; ++i) {
+    listeners.emplace_back(new TestFlushListener(options.env, this));
+  }
+
+  std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumListeners; ++i) {
+    options.listeners.emplace_back(listeners[i]);
+  }
+  DBOptions db_opts(options);
+  ColumnFamilyOptions cf_opts(options);
+
+  std::vector<DB*> dbs;
+  std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    ASSERT_OK(DestroyDB(dbname_ + ToString(d), options));
+    DB* db;
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db));
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ColumnFamilyHandle* handle;
+      db->CreateColumnFamily(cf_opts, cf_names[c], &handle);
+      handles.push_back(handle);
+    }
+
+    vec_handles.push_back(std::move(handles));
+    dbs.push_back(db);
+  }
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c],
+                            cf_names[c], cf_names[c]));
+    }
+  }
+
+  for (size_t c = 0; c < cf_names.size(); ++c) {
+    for (int d = 0; d < kNumDBs; ++d) {
+      ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+      reinterpret_cast<DBImpl*>(dbs[d])->TEST_WaitForFlushMemTable();
+    }
+  }
+
+  for (auto* listener : listeners) {
+    int pos = 0;
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      for (int d = 0; d < kNumDBs; ++d) {
+        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+        ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+        pos++;
+      }
+    }
+  }
+
+  for (auto handles : vec_handles) {
+    for (auto h : handles) {
+      delete h;
+    }
+    handles.clear();
+  }
+  vec_handles.clear();
+
+  for (auto db : dbs) {
+    delete db;
+  }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+  Options options;
+  options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  const int kCompactionTrigger = 1;
+  const int kSlowdownTrigger = 5;
+  const int kStopTrigger = 100;
+  options.level0_file_num_compaction_trigger = kCompactionTrigger;
+  options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+  options.level0_stop_writes_trigger = kStopTrigger;
+  options.max_write_buffer_number = 10;
+  options.listeners.emplace_back(listener);
+  // BG compaction is disabled. The number of L0 files will simply keep
+  // increasing in this test.
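+  // (With kCompactionStyleNone, files can only be compacted by an explicit
+  // manual-compaction call, which this test never issues.)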
+  options.compaction_style = kCompactionStyleNone;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
+  // keep writing until writes are forced to stop.
+  for (int i = 0;
+       static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10; ++i) {
+    Put(1, ToString(i), std::string(10000, 'x'), WriteOptions());
+    FlushOptions fo;
+    fo.allow_write_stall = true;
+    db_->Flush(fo, handles_[1]);
+    db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  }
+  ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
+}
+
+class TestCompactionReasonListener : public EventListener {
+ public:
+  void OnCompactionCompleted(DB* /*db*/,
+                             const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    compaction_reasons_.push_back(ci.compaction_reason);
+  }
+
+  std::vector<CompactionReason> compaction_reasons_;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, CompactionReasonLevel) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleLevel;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(listener->compaction_reasons_.size(), 1);
+  ASSERT_EQ(listener->compaction_reasons_[0],
+            CompactionReason::kLevelL0FilesNum);
+
+  DestroyAndReopen(options);
+
+  // Write 3 non-overlapping files in L0
+  for (int k = 1; k <= 30; k++) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+    if (k % 10 == 0) {
+      Flush();
+    }
+  }
+
+  // Do a trivial move from L0 -> L1
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  options.max_bytes_for_level_base = 1;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_GT(listener->compaction_reasons_.size(), 1);
+
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize);
+  }
+
+  options.disable_auto_compactions = true;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  Put("key", "value");
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+  }
+}
+
+TEST_F(EventListenerTest, CompactionReasonUniversal) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.compaction_style = kCompactionStyleUniversal;
+
+  Random rnd(301);
+
+  options.level0_file_num_compaction_trigger = 8;
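+  // A huge max_size_amplification_percent effectively disables the space-
+  // amplification trigger, while a huge size_ratio lets almost any set of
+  // sorted runs qualify, so the reported reason should be the size ratio.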
+  options.compaction_options_universal.max_size_amplification_percent = 100000;
+  options.compaction_options_universal.size_ratio = 100000;
+  DestroyAndReopen(options);
+  listener->compaction_reasons_.clear();
+
+  // Write 8 files in L0
+  for (int i = 0; i < 8; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
+  }
+
+  options.level0_file_num_compaction_trigger = 8;
+  options.compaction_options_universal.max_size_amplification_percent = 1;
+  options.compaction_options_universal.size_ratio = 100000;
+
+  DestroyAndReopen(options);
+  listener->compaction_reasons_.clear();
+
+  // Write 8 files in L0
+  for (int i = 0; i < 8; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeAmplification);
+  }
+
+  options.disable_auto_compactions = true;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+  }
+}
+
+TEST_F(EventListenerTest, CompactionReasonFIFO) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.compaction_options_fifo.max_table_files_size = 1;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize);
+  }
+}
+
+class TableFileCreationListener : public EventListener {
+ public:
+  class TestEnv : public EnvWrapper {
+   public:
+    TestEnv() : EnvWrapper(Env::Default()) {}
+
+    void SetStatus(Status s) { status_ = s; }
+
+    Status NewWritableFile(const std::string& fname,
+                           std::unique_ptr<WritableFile>* result,
+                           const EnvOptions& options) override {
+      if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
+        if (!status_.ok()) {
+          return status_;
+        }
+      }
+      return Env::Default()->NewWritableFile(fname, result, options);
+    }
+
+   private:
+    Status status_;
+  };
+
+  TableFileCreationListener() {
+    for (int i = 0; i < 2; i++) {
+      started_[i] = finished_[i] = failure_[i] = 0;
+    }
+  }
+
+  int Index(TableFileCreationReason reason) {
+    int idx;
+    switch (reason) {
+      case TableFileCreationReason::kFlush:
+        idx = 0;
+        break;
+      case TableFileCreationReason::kCompaction:
+        idx = 1;
+        break;
+      default:
+        idx = -1;
+    }
+    return idx;
+  }
+
+  void CheckAndResetCounters(int flush_started, int flush_finished,
+                             int flush_failure, int compaction_started,
+                             int compaction_finished, int compaction_failure) {
+    ASSERT_EQ(started_[0], flush_started);
+    ASSERT_EQ(finished_[0], flush_finished);
+    ASSERT_EQ(failure_[0], flush_failure);
+    ASSERT_EQ(started_[1], compaction_started);
+    ASSERT_EQ(finished_[1], compaction_finished);
+    ASSERT_EQ(failure_[1], compaction_failure);
+    for (int i = 0; i < 2; i++) {
+      started_[i] = finished_[i] = failure_[i] = 0;
+    }
+  }
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& info) override {
+    int idx = Index(info.reason);
+    if (idx >= 0) {
+      started_[idx]++;
+    }
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+  }
+
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    int idx = Index(info.reason);
+    if (idx >= 0) {
+      finished_[idx]++;
+    }
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    if (info.status.ok()) {
+      ASSERT_GT(info.table_properties.data_size, 0U);
+      ASSERT_GT(info.table_properties.raw_key_size, 0U);
+      ASSERT_GT(info.table_properties.raw_value_size, 0U);
+      ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+      ASSERT_GT(info.table_properties.num_entries, 0U);
+    } else {
+      if (idx >= 0) {
+        failure_[idx]++;
+      }
+    }
+  }
+
+  TestEnv test_env;
+  int started_[2];
+  int finished_[2];
+  int failure_[2];
+};
+
+TEST_F(EventListenerTest, TableFileCreationListenersTest) {
+  auto listener = std::make_shared<TableFileCreationListener>();
+  Options options;
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  options.env = &listener->test_env;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "aaa"));
+  ASSERT_OK(Put("bar", "bbb"));
+  ASSERT_OK(Flush());
+  dbfull()->TEST_WaitForFlushMemTable();
+  listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+  ASSERT_OK(Put("foo", "aaa1"));
+  ASSERT_OK(Put("bar", "bbb1"));
+  listener->test_env.SetStatus(Status::NotSupported("not supported"));
+  ASSERT_NOK(Flush());
+  listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+  listener->test_env.SetStatus(Status::OK());
+
+  Reopen(options);
+  ASSERT_OK(Put("foo", "aaa2"));
+  ASSERT_OK(Put("bar", "bbb2"));
+  ASSERT_OK(Flush());
+  dbfull()->TEST_WaitForFlushMemTable();
+  listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+  const Slice kRangeStart = "a";
+  const Slice kRangeEnd = "z";
+  dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd);
+  dbfull()->TEST_WaitForCompact();
+  listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0);
+
+  ASSERT_OK(Put("foo", "aaa3"));
+  ASSERT_OK(Put("bar", "bbb3"));
+  ASSERT_OK(Flush());
+  listener->test_env.SetStatus(Status::NotSupported("not supported"));
+  dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd);
+  dbfull()->TEST_WaitForCompact();
+  listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1);
+}
+
+class MemTableSealedListener : public EventListener {
+ private:
+  SequenceNumber latest_seq_number_;
+
+ public:
+  MemTableSealedListener() {}
+  void OnMemTableSealed(const MemTableInfo& info) override {
+    latest_seq_number_ = info.first_seqno;
+  }
+
+  void OnFlushCompleted(DB* /*db*/,
+                        const FlushJobInfo& flush_job_info) override {
+    ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_);
+  }
+};
+
+TEST_F(EventListenerTest, MemTableSealedListenerTest) {
+  auto listener = std::make_shared<MemTableSealedListener>();
+  Options options;
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  DestroyAndReopen(options);
+
+  for (unsigned int i = 0; i < 10; i++) {
+    std::string tag = std::to_string(i);
ASSERT_OK(Put("foo"+tag, "aaa")); + ASSERT_OK(Put("bar"+tag, "bbb")); + + ASSERT_OK(Flush()); + } +} + +class ColumnFamilyHandleDeletionStartedListener : public EventListener { + private: + std::vector cfs_; + int counter; + + public: + explicit ColumnFamilyHandleDeletionStartedListener( + const std::vector& cfs) + : cfs_(cfs), counter(0) { + cfs_.insert(cfs_.begin(), kDefaultColumnFamilyName); + } + void OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* handle) override { + ASSERT_EQ(cfs_[handle->GetID()], handle->GetName()); + counter++; + } + int getCounter() { return counter; } +}; + +TEST_F(EventListenerTest, ColumnFamilyHandleDeletionStartedListenerTest) { + std::vector cfs{"pikachu", "eevee", "Mewtwo"}; + auto listener = + std::make_shared(cfs); + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.listeners.push_back(listener); + CreateAndReopenWithCF(cfs, options); + ASSERT_EQ(handles_.size(), 4); + delete handles_[3]; + delete handles_[2]; + delete handles_[1]; + handles_.resize(1); + ASSERT_EQ(listener->getCounter(), 3); +} + +class BackgroundErrorListener : public EventListener { + private: + SpecialEnv* env_; + int counter_; + + public: + BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {} + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (counter_ == 0) { + // suppress the first error and disable write-dropping such that a retry + // can succeed. + *bg_error = Status::OK(); + env_->drop_writes_.store(false, std::memory_order_release); + env_->no_slowdown_ = false; + } + ++counter_; + } + + int counter() { return counter_; } +}; + +TEST_F(EventListenerTest, BackgroundErrorListenerFailedFlushTest) { + auto listener = std::make_shared(env_); + Options options; + options.create_if_missing = true; + options.env = env_; + options.listeners.push_back(listener); + options.memtable_factory.reset(new SpecialSkipListFactory(1)); + options.paranoid_checks = true; + DestroyAndReopen(options); + + // the usual TEST_WaitForFlushMemTable() doesn't work for failed flushes, so + // forge a custom one for the failed flush case. 
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkFlush:done",
+        "EventListenerTest:BackgroundErrorListenerFailedFlushTest:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  env_->drop_writes_.store(true, std::memory_order_release);
+  env_->no_slowdown_ = true;
+
+  ASSERT_OK(Put("key0", "val"));
+  ASSERT_OK(Put("key1", "val"));
+  TEST_SYNC_POINT("EventListenerTest:BackgroundErrorListenerFailedFlushTest:1");
+  ASSERT_EQ(1, listener->counter());
+  ASSERT_OK(Put("key2", "val"));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
+  auto listener = std::make_shared<BackgroundErrorListener>(env_);
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.push_back(listener);
+  options.memtable_factory.reset(new SpecialSkipListFactory(2));
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  // third iteration triggers the second memtable's flush
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("key0", "val"));
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    ASSERT_OK(Put("key1", "val"));
+  }
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+  env_->drop_writes_.store(true, std::memory_order_release);
+  env_->no_slowdown_ = true;
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(1, listener->counter());
+
+  // trigger flush so compaction is triggered again; this time it succeeds.
+  // The previous failed compaction may get retried automatically, so we may
+  // be left with 0 or 1 files in level 1, depending on when the retry gets
+  // scheduled.
+  ASSERT_OK(Put("key0", "val"));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
+
+class TestFileOperationListener : public EventListener {
+ public:
+  TestFileOperationListener() {
+    file_reads_.store(0);
+    file_reads_success_.store(0);
+    file_writes_.store(0);
+    file_writes_success_.store(0);
+  }
+
+  void OnFileReadFinish(const FileOperationInfo& info) override {
+    ++file_reads_;
+    if (info.status.ok()) {
+      ++file_reads_success_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileWriteFinish(const FileOperationInfo& info) override {
+    ++file_writes_;
+    if (info.status.ok()) {
+      ++file_writes_success_;
+    }
+    ReportDuration(info);
+  }
+
+  bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+  std::atomic<size_t> file_reads_;
+  std::atomic<size_t> file_reads_success_;
+  std::atomic<size_t> file_writes_;
+  std::atomic<size_t> file_writes_success_;
+
+ private:
+  void ReportDuration(const FileOperationInfo& info) const {
+    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        info.finish_timestamp - info.start_timestamp);
+    ASSERT_GT(duration.count(), 0);
+  }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "aaa"));
+  dbfull()->Flush(FlushOptions());
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_GE(listener->file_writes_.load(),
+            listener->file_writes_success_.load());
+  ASSERT_GT(listener->file_writes_.load(), 0);
+  Close();
+
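+  // Reopening forces reads (MANIFEST replay, table footers and index blocks),
+  // which is what drives the read counters checked below.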
+  Reopen(options);
+  ASSERT_GE(listener->file_reads_.load(),
+            listener->file_reads_success_.load());
+  ASSERT_GT(listener->file_reads_.load(), 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_format.h b/src/rocksdb/db/log_format.h
new file mode 100644
index 000000000..c22e2b6bc
--- /dev/null
+++ b/src/rocksdb/db/log_format.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4,
+
+  // For recycled log files
+  kRecyclableFullType = 5,
+  kRecyclableFirstType = 6,
+  kRecyclableMiddleType = 7,
+  kRecyclableLastType = 8,
+};
+static const int kMaxRecordType = kRecyclableLastType;
+
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+static const int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
new file mode 100644
index 000000000..c60a814b9
--- /dev/null
+++ b/src/rocksdb/db/log_reader.cc
@@ -0,0 +1,624 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "file/sequence_file_reader.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(std::shared_ptr<Logger> info_log,
+               std::unique_ptr<SequentialFileReader>&& _file,
+               Reporter* reporter, bool checksum, uint64_t log_num)
+    : info_log_(info_log),
+      file_(std::move(_file)),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      read_error_(false),
+      eof_offset_(0),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      log_number_(log_num),
+      recycled_(false) {}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+// For kAbsoluteConsistency, on clean shutdown we don't expect any error
+// in the log files.
For other modes, we can ignore only incomplete records +// in the last log file, which are presumably due to a write in progress +// during restart (or from log recycling). +// +// TODO krad: Evaluate if we need to move to a more strict mode where we +// restrict the inconsistency to only the last log +bool Reader::ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode) { + scratch->clear(); + record->clear(); + bool in_fragmented_record = false; + // Record offset of the logical record that we're reading + // 0 is a dummy value to make compilers happy + uint64_t prospective_record_offset = 0; + + Slice fragment; + while (true) { + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + size_t drop_size = 0; + const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size); + switch (record_type) { + case kFullType: + case kRecyclableFullType: + if (in_fragmented_record && !scratch->empty()) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + ReportCorruption(scratch->size(), "partial record without end(1)"); + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + *record = fragment; + last_record_offset_ = prospective_record_offset; + return true; + + case kFirstType: + case kRecyclableFirstType: + if (in_fragmented_record && !scratch->empty()) { + // Handle bug in earlier versions of log::Writer where + // it could emit an empty kFirstType record at the tail end + // of a block followed by a kFullType or kFirstType record + // at the beginning of the next block. + ReportCorruption(scratch->size(), "partial record without end(2)"); + } + prospective_record_offset = physical_record_offset; + scratch->assign(fragment.data(), fragment.size()); + in_fragmented_record = true; + break; + + case kMiddleType: + case kRecyclableMiddleType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + scratch->append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + case kRecyclableLastType: + if (!in_fragmented_record) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + scratch->append(fragment.data(), fragment.size()); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + return true; + } + break; + + case kBadHeader: + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files + ReportCorruption(drop_size, "truncated header"); + } + FALLTHROUGH_INTENDED; + + case kEof: + if (in_fragmented_record) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files + ReportCorruption(scratch->size(), "error reading trailing data"); + } + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. + scratch->clear(); + } + return false; + + case kOldRecord: + if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) { + // Treat a record from a previous instance of the log as EOF. 
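+          // (ReadPhysicalRecord returns kOldRecord only when a recyclable
+          // header carries a log number different from log_number_, i.e.
+          // leftover data from a previous life of a recycled file.)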
+ if (in_fragmented_record) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files + ReportCorruption(scratch->size(), "error reading trailing data"); + } + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. + scratch->clear(); + } + return false; + } + FALLTHROUGH_INTENDED; + + case kBadRecord: + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + case kBadRecordLen: + case kBadRecordChecksum: + if (recycled_ && + wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords) { + scratch->clear(); + return false; + } + if (record_type == kBadRecordLen) { + ReportCorruption(drop_size, "bad record length"); + } else { + ReportCorruption(drop_size, "checksum mismatch"); + } + if (in_fragmented_record) { + ReportCorruption(scratch->size(), "error in middle of record"); + in_fragmented_record = false; + scratch->clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", record_type); + ReportCorruption( + (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), + buf); + in_fragmented_record = false; + scratch->clear(); + break; + } + } + } + return false; +} + +uint64_t Reader::LastRecordOffset() { + return last_record_offset_; +} + +void Reader::UnmarkEOF() { + if (read_error_) { + return; + } + eof_ = false; + if (eof_offset_ == 0) { + return; + } + UnmarkEOFInternal(); +} + +void Reader::UnmarkEOFInternal() { + // If the EOF was in the middle of a block (a partial block was read) we have + // to read the rest of the block as ReadPhysicalRecord can only read full + // blocks and expects the file position indicator to be aligned to the start + // of a block. + // + // consumed_bytes + buffer_size() + remaining == kBlockSize + + size_t consumed_bytes = eof_offset_ - buffer_.size(); + size_t remaining = kBlockSize - eof_offset_; + + // backing_store_ is used to concatenate what is left in buffer_ and + // the remainder of the block. If buffer_ already uses backing_store_, + // we just append the new data. + if (buffer_.data() != backing_store_ + consumed_bytes) { + // Buffer_ does not use backing_store_ for storage. + // Copy what is left in buffer_ to backing_store. 
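+    // Worked example with hypothetical numbers: if kBlockSize = 32768,
+    // eof_offset_ = 100 and buffer_.size() = 40, then consumed_bytes = 60
+    // and remaining = 32668; the 40 buffered bytes move to
+    // backing_store_ + 60 and the next Read() refills from offset 100.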
+    memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+  }
+
+  Slice read_buffer;
+  Status status = file_->Read(remaining, &read_buffer,
+                              backing_store_ + eof_offset_);
+
+  size_t added = read_buffer.size();
+  end_of_buffer_offset_ += added;
+
+  if (!status.ok()) {
+    if (added > 0) {
+      ReportDrop(added, status);
+    }
+
+    read_error_ = true;
+    return;
+  }
+
+  if (read_buffer.data() != backing_store_ + eof_offset_) {
+    // Read did not write to backing_store_
+    memmove(backing_store_ + eof_offset_, read_buffer.data(),
+            read_buffer.size());
+  }
+
+  buffer_ = Slice(backing_store_ + consumed_bytes,
+                  eof_offset_ + added - consumed_bytes);
+
+  if (added < remaining) {
+    eof_ = true;
+    eof_offset_ += added;
+  } else {
+    eof_offset_ = 0;
+  }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int *error) {
+  if (!eof_ && !read_error_) {
+    // Last read was a full read, so this is a trailer to skip
+    buffer_.clear();
+    Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+    end_of_buffer_offset_ += buffer_.size();
+    if (!status.ok()) {
+      buffer_.clear();
+      ReportDrop(kBlockSize, status);
+      read_error_ = true;
+      *error = kEof;
+      return false;
+    } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+      eof_ = true;
+      eof_offset_ = buffer_.size();
+    }
+    return true;
+  } else {
+    // Note that if buffer_ is non-empty, we have a truncated header at the
+    // end of the file, which can be caused by the writer crashing in the
+    // middle of writing the header. Unless explicitly requested, we don't
+    // consider this an error, just report EOF.
+    if (buffer_.size()) {
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      *error = kBadHeader;
+      return false;
+    }
+    buffer_.clear();
+    *error = kEof;
+    return false;
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
+  while (true) {
+    // We need at least the minimum header size
+    if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+      // the default value of r is meaningless because ReadMore will overwrite
+      // it if it returns false; in case it returns true, the return value will
+      // not be used anyway
+      int r = kEof;
+      if (!ReadMore(drop_size, &r)) {
+        return r;
+      }
+      continue;
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    int header_size = kHeaderSize;
+    if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+      if (end_of_buffer_offset_ - buffer_.size() == 0) {
+        recycled_ = true;
+      }
+      header_size = kRecyclableHeaderSize;
+      // We need enough for the larger header
+      if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+        int r = kEof;
+        if (!ReadMore(drop_size, &r)) {
+          return r;
+        }
+        continue;
+      }
+      const uint32_t log_num = DecodeFixed32(header + 7);
+      if (log_num != log_number_) {
+        return kOldRecord;
+      }
+    }
+    if (header_size + length > buffer_.size()) {
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      if (!eof_) {
+        return kBadRecordLen;
+      }
+      // If the end of the file has been reached without reading |length|
+      // bytes of payload, assume the writer died in the middle of writing the
+      // record.
Don't report a corruption unless requested. + if (*drop_size) { + return kBadHeader; + } + return kEof; + } + + if (type == kZeroType && length == 0) { + // Skip zero length record without reporting any drops since + // such records are produced by the mmap based writing code in + // env_posix.cc that preallocates file regions. + // NOTE: this should never happen in DB written by new RocksDB versions, + // since we turn off mmap writes to manifest and log files + buffer_.clear(); + return kBadRecord; + } + + // Check crc + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6); + if (actual_crc != expected_crc) { + // Drop the rest of the buffer since "length" itself may have + // been corrupted and if we trust it, we could find some + // fragment of a real log record that just happens to look + // like a valid log record. + *drop_size = buffer_.size(); + buffer_.clear(); + return kBadRecordChecksum; + } + } + + buffer_.remove_prefix(header_size + length); + + *result = Slice(header + header_size, length); + return type; + } +} + +bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode /*unused*/) { + assert(record != nullptr); + assert(scratch != nullptr); + record->clear(); + scratch->clear(); + + uint64_t prospective_record_offset = 0; + uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); + size_t drop_size = 0; + unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy + Slice fragment; + while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) { + switch (fragment_type_or_err) { + case kFullType: + case kRecyclableFullType: + if (in_fragmented_record_ && !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(1)"); + } + fragments_.clear(); + *record = fragment; + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + return true; + + case kFirstType: + case kRecyclableFirstType: + if (in_fragmented_record_ || !fragments_.empty()) { + ReportCorruption(fragments_.size(), "partial record without end(2)"); + } + prospective_record_offset = physical_record_offset; + fragments_.assign(fragment.data(), fragment.size()); + in_fragmented_record_ = true; + break; + + case kMiddleType: + case kRecyclableMiddleType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(1)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + } + break; + + case kLastType: + case kRecyclableLastType: + if (!in_fragmented_record_) { + ReportCorruption(fragment.size(), + "missing start of fragmented record(2)"); + } else { + fragments_.append(fragment.data(), fragment.size()); + scratch->assign(fragments_.data(), fragments_.size()); + fragments_.clear(); + *record = Slice(*scratch); + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + return true; + } + break; + + case kBadHeader: + case kBadRecord: + case kEof: + case kOldRecord: + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + case kBadRecordChecksum: + if (recycled_) { + fragments_.clear(); + return false; + } + ReportCorruption(drop_size, "checksum mismatch"); + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error 
in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + default: { + char buf[40]; + snprintf(buf, sizeof(buf), "unknown record type %u", + fragment_type_or_err); + ReportCorruption( + fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0), + buf); + in_fragmented_record_ = false; + fragments_.clear(); + break; + } + } + } + return false; +} + +void FragmentBufferedReader::UnmarkEOF() { + if (read_error_) { + return; + } + eof_ = false; + UnmarkEOFInternal(); +} + +bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + read_error_ = true; + *error = kEof; + return false; + } else if (buffer_.size() < static_cast(kBlockSize)) { + eof_ = true; + eof_offset_ = buffer_.size(); + TEST_SYNC_POINT_CALLBACK( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr); + } + return true; + } else if (!read_error_) { + UnmarkEOF(); + } + if (!read_error_) { + return true; + } + *error = kEof; + *drop_size = buffer_.size(); + if (buffer_.size() > 0) { + *error = kBadHeader; + } + buffer_.clear(); + return false; +} + +// return true if the caller should process the fragment_type_or_err. +bool FragmentBufferedReader::TryReadFragment( + Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) { + assert(fragment != nullptr); + assert(drop_size != nullptr); + assert(fragment_type_or_err != nullptr); + + while (buffer_.size() < static_cast(kHeaderSize)) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + const char* header = buffer_.data(); + const uint32_t a = static_cast(header[4]) & 0xff; + const uint32_t b = static_cast(header[5]) & 0xff; + const unsigned int type = header[6]; + const uint32_t length = a | (b << 8); + int header_size = kHeaderSize; + if (type >= kRecyclableFullType && type <= kRecyclableLastType) { + if (end_of_buffer_offset_ - buffer_.size() == 0) { + recycled_ = true; + } + header_size = kRecyclableHeaderSize; + while (buffer_.size() < static_cast(kRecyclableHeaderSize)) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + const uint32_t log_num = DecodeFixed32(header + 7); + if (log_num != log_number_) { + *fragment_type_or_err = kOldRecord; + return true; + } + } + + while (header_size + length > buffer_.size()) { + size_t old_size = buffer_.size(); + int error = kEof; + if (!TryReadMore(drop_size, &error)) { + *fragment_type_or_err = error; + return false; + } else if (old_size == buffer_.size()) { + return false; + } + } + + if (type == kZeroType && length == 0) { + buffer_.clear(); + *fragment_type_or_err = kBadRecord; + return true; + } + + if (checksum_) { + uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); + uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6); + if (actual_crc != expected_crc) { + *drop_size = buffer_.size(); + buffer_.clear(); + *fragment_type_or_err = kBadRecordChecksum; + return true; + } + } + + 
buffer_.remove_prefix(header_size + length); + + *fragment = Slice(header + header_size, length); + *fragment_type_or_err = type; + return true; +} + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h new file mode 100644 index 000000000..293ae957c --- /dev/null +++ b/src/rocksdb/db/log_reader.h @@ -0,0 +1,189 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "db/log_format.h" +#include "file/sequence_file_reader.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; + +namespace log { + +/** + * Reader is a general purpose log stream reader implementation. The actual job + * of reading from the device is implemented by the SequentialFile interface. + * + * Please see Writer for details on the file and record layout. + */ +class Reader { + public: + // Interface for reporting errors. + class Reporter { + public: + virtual ~Reporter(); + + // Some corruption was detected. "size" is the approximate number + // of bytes dropped due to the corruption. + virtual void Corruption(size_t bytes, const Status& status) = 0; + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-nullptr, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + Reader(std::shared_ptr info_log, + // @lint-ignore TXT2 T25377293 Grandfathered in + std::unique_ptr&& file, Reporter* reporter, + bool checksum, uint64_t log_num); + // No copying allowed + Reader(const Reader&) = delete; + void operator=(const Reader&) = delete; + + virtual ~Reader(); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. + virtual bool ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords); + + // Returns the physical offset of the last record returned by ReadRecord. + // + // Undefined before the first call to ReadRecord. + uint64_t LastRecordOffset(); + + // returns true if the reader has encountered an eof condition. + bool IsEOF() { + return eof_; + } + + // returns true if the reader has encountered read error. + bool hasReadError() const { return read_error_; } + + // when we know more data has been written to the file. we can use this + // function to force the reader to look again in the file. + // Also aligns the file position indicator to the start of the next block + // by reading the rest of the data from the EOF position to the end of the + // block that was partially read. 
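+  //
+  // A typical tailing sequence looks roughly like this (sketch only;
+  // Process() and MoreDataArrived() are hypothetical placeholders, not part
+  // of this API):
+  //
+  //   while (reader.ReadRecord(&record, &scratch)) Process(record);
+  //   if (reader.IsEOF() && MoreDataArrived()) {
+  //     reader.UnmarkEOF();  // re-align to the block boundary and resume
+  //   }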
+ virtual void UnmarkEOF(); + + SequentialFileReader* file() { return file_.get(); } + + Reporter* GetReporter() const { return reporter_; } + + uint64_t GetLogNumber() const { return log_number_; } + + size_t GetReadOffset() const { + return static_cast(end_of_buffer_offset_); + } + + protected: + std::shared_ptr info_log_; + const std::unique_ptr file_; + Reporter* const reporter_; + bool const checksum_; + char* const backing_store_; + + // Internal state variables used for reading records + Slice buffer_; + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + bool read_error_; // Error occurred while reading from file + + // Offset of the file position indicator within the last block when an + // EOF was detected. + size_t eof_offset_; + + // Offset of the last record returned by ReadRecord. + uint64_t last_record_offset_; + // Offset of the first location past the end of buffer_. + uint64_t end_of_buffer_offset_; + + // which log number this is + uint64_t const log_number_; + + // Whether this is a recycled log file + bool recycled_; + + // Extend record types with the following special values + enum { + kEof = kMaxRecordType + 1, + // Returned whenever we find an invalid physical record. + // Currently there are three situations in which this happens: + // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) + // * The record is a 0-length record (No drop is reported) + kBadRecord = kMaxRecordType + 2, + // Returned when we fail to read a valid header. + kBadHeader = kMaxRecordType + 3, + // Returned when we read an old record from a previous user of the log. + kOldRecord = kMaxRecordType + 4, + // Returned when we get a bad record length + kBadRecordLen = kMaxRecordType + 5, + // Returned when we get a bad record checksum + kBadRecordChecksum = kMaxRecordType + 6, + }; + + // Return type, or one of the preceding special values + unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); + + // Read some more + bool ReadMore(size_t* drop_size, int *error); + + void UnmarkEOFInternal(); + + // Reports dropped bytes to the reporter. + // buffer_ must be updated to remove the dropped bytes prior to invocation. + void ReportCorruption(size_t bytes, const char* reason); + void ReportDrop(size_t bytes, const Status& reason); +}; + +class FragmentBufferedReader : public Reader { + public: + FragmentBufferedReader(std::shared_ptr info_log, + // @lint-ignore TXT2 T25377293 Grandfathered in + std::unique_ptr&& _file, + Reporter* reporter, bool checksum, uint64_t log_num) + : Reader(info_log, std::move(_file), reporter, checksum, log_num), + fragments_(), + in_fragmented_record_(false) {} + ~FragmentBufferedReader() override {} + bool ReadRecord(Slice* record, std::string* scratch, + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords) override; + void UnmarkEOF() override; + + private: + std::string fragments_; + bool in_fragmented_record_; + + bool TryReadFragment(Slice* result, size_t* drop_size, + unsigned int* fragment_type_or_err); + + bool TryReadMore(size_t* drop_size, int* error); + + // No copy allowed + FragmentBufferedReader(const FragmentBufferedReader&); + void operator=(const FragmentBufferedReader&); +}; + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc new file mode 100644 index 000000000..849b89d8a --- /dev/null +++ b/src/rocksdb/db/log_test.cc @@ -0,0 +1,928 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "env/composite_env_wrapper.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+// Param type is tuple<int, bool>
+// get<0>(tuple): non-zero if recycling log, zero if regular log
+// get<1>(tuple): true if allow retry after read EOF, false otherwise
+class LogTest : public ::testing::TestWithParam<std::tuple<int, bool>> {
+ private:
+  class StringSource : public SequentialFile {
+   public:
+    Slice& contents_;
+    bool force_error_;
+    size_t force_error_position_;
+    bool force_eof_;
+    size_t force_eof_position_;
+    bool returned_partial_;
+    bool fail_after_read_partial_;
+    explicit StringSource(Slice& contents, bool fail_after_read_partial)
+        : contents_(contents),
+          force_error_(false),
+          force_error_position_(0),
+          force_eof_(false),
+          force_eof_position_(0),
+          returned_partial_(false),
+          fail_after_read_partial_(fail_after_read_partial) {}
+
+    Status Read(size_t n, Slice* result, char* scratch) override {
+      if (fail_after_read_partial_) {
+        EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+      }
+
+      if (force_error_) {
+        if (force_error_position_ >= n) {
+          force_error_position_ -= n;
+        } else {
+          *result = Slice(contents_.data(), force_error_position_);
+          contents_.remove_prefix(force_error_position_);
+          force_error_ = false;
+          returned_partial_ = true;
+          return Status::Corruption("read error");
+        }
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+
+      if (force_eof_) {
+        if (force_eof_position_ >= n) {
+          force_eof_position_ -= n;
+        } else {
+          force_eof_ = false;
+          n = force_eof_position_;
+          returned_partial_ = true;
+        }
+      }
+
+      // By using scratch we ensure that caller has control over the
+      // lifetime of result.data()
+      memcpy(scratch, contents_.data(), n);
+      *result = Slice(scratch, n);
+
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+
+    Status Skip(uint64_t n) override {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return Status::NotFound("in-memory file skipped past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return Status::OK();
+    }
+  };
+
+  inline StringSource* GetStringSourceFromLegacyReader(
+      SequentialFileReader* reader) {
+    LegacySequentialFileWrapper* file =
+        static_cast<LegacySequentialFileWrapper*>(reader->file());
+    return static_cast<StringSource*>(file->target());
+  }
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    void Corruption(size_t bytes, const Status& status) override {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  std::string& dest_contents() {
+    auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  const std::string& dest_contents() const {
+    auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  void reset_source_contents() {
+    auto src = GetStringSourceFromLegacyReader(reader_->file());
+    assert(src);
+    src->contents_ = dest_contents();
+  }
+
+  Slice reader_contents_;
+  std::unique_ptr<WritableFileWriter> dest_holder_;
+  std::unique_ptr<SequentialFileReader> source_holder_;
+  ReportCollector report_;
+  Writer writer_;
+  std::unique_ptr<Reader> reader_;
+
+ protected:
+  bool allow_retry_read_;
+
+ public:
+  LogTest()
+      : reader_contents_(),
+        dest_holder_(test::GetWritableFileWriter(
+            new test::StringSink(&reader_contents_), "" /* don't care */)),
+        source_holder_(test::GetSequentialFileReader(
+            new StringSource(reader_contents_, !std::get<1>(GetParam())),
+            "" /* file name */)),
+        writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())),
+        allow_retry_read_(std::get<1>(GetParam())) {
+    if (allow_retry_read_) {
+      reader_.reset(new FragmentBufferedReader(
+          nullptr, std::move(source_holder_), &report_, true /* checksum */,
+          123 /* log_number */));
+    } else {
+      reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_,
+                               true /* checksum */, 123 /* log_number */));
+    }
+  }
+
+  Slice* get_reader_contents() { return &reader_contents_; }
+
+  void Write(const std::string& msg) {
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_contents().size();
+  }
+
+  std::string Read(const WALRecoveryMode wal_recovery_mode =
+                       WALRecoveryMode::kTolerateCorruptedTailRecords) {
+    std::string scratch;
+    Slice record;
+    bool ret = false;
+    ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode);
+    if (ret) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, char delta) {
+    dest_contents()[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_contents()[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    auto dest = test::GetStringSinkFromLegacyWriter(writer_.file());
+    assert(dest);
+    dest->Drop(bytes);
+  }
+
+  void FixChecksum(int header_offset, int len, bool recyclable) {
+    // Compute crc of type/len/data
+    int header_size = recyclable ?
kRecyclableHeaderSize : kHeaderSize; + uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6], + header_size - 6 + len); + crc = crc32c::Mask(crc); + EncodeFixed32(&dest_contents()[header_offset], crc); + } + + void ForceError(size_t position = 0) { + auto src = GetStringSourceFromLegacyReader(reader_->file()); + src->force_error_ = true; + src->force_error_position_ = position; + } + + size_t DroppedBytes() const { + return report_.dropped_bytes_; + } + + std::string ReportMessage() const { + return report_.message_; + } + + void ForceEOF(size_t position = 0) { + auto src = GetStringSourceFromLegacyReader(reader_->file()); + src->force_eof_ = true; + src->force_eof_position_ = position; + } + + void UnmarkEOF() { + auto src = GetStringSourceFromLegacyReader(reader_->file()); + src->returned_partial_ = false; + reader_->UnmarkEOF(); + } + + bool IsEOF() { return reader_->IsEOF(); } + + // Returns OK iff recorded error message contains "msg" + std::string MatchError(const std::string& msg) const { + if (report_.message_.find(msg) == std::string::npos) { + return report_.message_; + } else { + return "OK"; + } + } +}; + +TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } + +TEST_P(LogTest, ReadWrite) { + Write("foo"); + Write("bar"); + Write(""); + Write("xxxx"); + ASSERT_EQ("foo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("xxxx", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST_P(LogTest, ManyBlocks) { + for (int i = 0; i < 100000; i++) { + Write(NumberString(i)); + } + for (int i = 0; i < 100000; i++) { + ASSERT_EQ(NumberString(i), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, Fragmentation) { + Write("small"); + Write(BigString("medium", 50000)); + Write(BigString("large", 100000)); + ASSERT_EQ("small", Read()); + ASSERT_EQ(BigString("medium", 50000), Read()); + ASSERT_EQ(BigString("large", 100000), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, MarginalTrailer) { + // Make a trailer that is exactly the same length as an empty record. + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, MarginalTrailer2) { + // Make a trailer that is exactly the same length as an empty record. + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST_P(LogTest, ShortTrailer) { + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); + Write(""); + Write("bar"); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, AlignedEof) { + int header_size = + std::get<0>(GetParam()) ? 
kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; + Write(BigString("foo", n)); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, RandomRead) { + const int N = 500; + Random write_rnd(301); + for (int i = 0; i < N; i++) { + Write(RandomSkewedString(i, &write_rnd)); + } + Random read_rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); + } + ASSERT_EQ("EOF", Read()); +} + +// Tests of all the error paths in log_reader.cc follow: + +TEST_P(LogTest, ReadError) { + Write("foo"); + ForceError(); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST_P(LogTest, BadRecordType) { + Write("foo"); + // Type is stored in header[6] + IncrementByte(6, 100); + FixChecksum(0, 3, false); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("unknown record type")); +} + +TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) { + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read()); + // Truncated last record is ignored, not treated as an error + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } + Write("foo"); + ShrinkSize(4); // Drop all payload as well as a header byte + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + // Truncated last record is ignored, not treated as an error + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError("Corruption: truncated header")); +} + +TEST_P(LogTest, BadLength) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. + return; + } + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; + const int kPayloadSize = kBlockSize - header_size; + Write(BigString("bar", kPayloadSize)); + Write("foo"); + // Least significant size byte is stored in header[4]. + IncrementByte(4, 1); + if (!recyclable_log) { + ASSERT_EQ("foo", Read()); + ASSERT_EQ(kBlockSize, DroppedBytes()); + ASSERT_EQ("OK", MatchError("bad record length")); + } else { + ASSERT_EQ("EOF", Read()); + } +} + +TEST_P(LogTest, BadLengthAtEndIsIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. + return; + } + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); +} + +TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then we should not raise an error when the + // record length specified in header is longer than data currently + // available. It's possible that the body of the record is not written yet. 
+ return; + } + Write("foo"); + ShrinkSize(1); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError("Corruption: truncated header")); +} + +TEST_P(LogTest, ChecksumMismatch) { + Write("foooooo"); + IncrementByte(0, 14); + ASSERT_EQ("EOF", Read()); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { + ASSERT_EQ(14U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("checksum mismatch")); + } else { + ASSERT_EQ(0U, DroppedBytes()); + ASSERT_EQ("", ReportMessage()); + } +} + +TEST_P(LogTest, UnexpectedMiddleType) { + Write("foo"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte(6, static_cast(recyclable_log ? kRecyclableMiddleType + : kMiddleType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST_P(LogTest, UnexpectedLastType) { + Write("foo"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte(6, + static_cast(recyclable_log ? kRecyclableLastType : kLastType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("missing start")); +} + +TEST_P(LogTest, UnexpectedFullType) { + Write("foo"); + Write("bar"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte( + 6, static_cast(recyclable_log ? kRecyclableFirstType : kFirstType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST_P(LogTest, UnexpectedFirstType) { + Write("foo"); + Write(BigString("bar", 100000)); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + SetByte( + 6, static_cast(recyclable_log ? kRecyclableFirstType : kFirstType)); + FixChecksum(0, 3, !!recyclable_log); + ASSERT_EQ(BigString("bar", 100000), Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("partial record without end")); +} + +TEST_P(LogTest, MissingLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Remove the LAST block, including header. + ShrinkSize(14); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST_P(LogTest, MissingLastIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } + Write(BigString("bar", kBlockSize)); + // Remove the LAST block, including header. + ShrinkSize(14); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data")); +} + +TEST_P(LogTest, PartialLastIsIgnored) { + Write(BigString("bar", kBlockSize)); + // Cause a bad record length in the LAST block. + ShrinkSize(1); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0U, DroppedBytes()); +} + +TEST_P(LogTest, PartialLastIsNotIgnored) { + if (allow_retry_read_) { + // If read retry is allowed, then truncated trailing record should not + // raise an error. + return; + } + Write(BigString("bar", kBlockSize)); + // Cause a bad record length in the LAST block. 
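+  // (Dropping one byte leaves the LAST fragment's header intact but its
+  // payload one byte short, so the recorded length no longer matches the
+  // data available at EOF.)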
+ ShrinkSize(1); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); + ASSERT_GT(DroppedBytes(), 0U); + ASSERT_EQ("OK", MatchError( + "Corruption: truncated headerCorruption: " + "error reading trailing data")); +} + +TEST_P(LogTest, ErrorJoinsRecords) { + // Consider two fragmented records: + // first(R1) last(R1) first(R2) last(R2) + // where the middle two fragments disappear. We do not want + // first(R1),last(R2) to get joined and returned as a valid record. + + // Write records that span two blocks + Write(BigString("foo", kBlockSize)); + Write(BigString("bar", kBlockSize)); + Write("correct"); + + // Wipe the middle block + for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + SetByte(offset, 'x'); + } + + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { + ASSERT_EQ("correct", Read()); + ASSERT_EQ("EOF", Read()); + size_t dropped = DroppedBytes(); + ASSERT_LE(dropped, 2 * kBlockSize + 100); + ASSERT_GE(dropped, 2 * kBlockSize); + } else { + ASSERT_EQ("EOF", Read()); + } +} + +TEST_P(LogTest, ClearEofSingleBlock) { + Write("foo"); + Write("bar"); + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; + ForceEOF(3 + header_size + 2); + ASSERT_EQ("foo", Read()); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_TRUE(IsEOF()); + ASSERT_EQ("EOF", Read()); + Write("xxx"); + UnmarkEOF(); + ASSERT_EQ("xxx", Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST_P(LogTest, ClearEofMultiBlock) { + size_t num_full_blocks = 5; + bool recyclable_log = (std::get<0>(GetParam()) != 0); + int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize; + size_t n = (kBlockSize - header_size) * num_full_blocks + 25; + Write(BigString("foo", n)); + Write(BigString("bar", n)); + ForceEOF(n + num_full_blocks * header_size + header_size + 3); + ASSERT_EQ(BigString("foo", n), Read()); + ASSERT_TRUE(IsEOF()); + UnmarkEOF(); + ASSERT_EQ(BigString("bar", n), Read()); + ASSERT_TRUE(IsEOF()); + Write(BigString("xxx", n)); + UnmarkEOF(); + ASSERT_EQ(BigString("xxx", n), Read()); + ASSERT_TRUE(IsEOF()); +} + +TEST_P(LogTest, ClearEofError) { + // If an error occurs during Read() in UnmarkEOF(), the records contained + // in the buffer should be returned on subsequent calls of ReadRecord() + // until no more full records are left, whereafter ReadRecord() should return + // false to indicate that it cannot read any further. 
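+  //
+  // Below, ForceError(0) makes the Read() issued by the second UnmarkEOF()
+  // fail; "bar" is already buffered at that point, so it is still returned
+  // before ReadRecord() finally reports EOF.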
+ + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + ASSERT_TRUE(IsEOF()); + Write("xxx"); + ForceError(0); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +TEST_P(LogTest, ClearEofError2) { + Write("foo"); + Write("bar"); + UnmarkEOF(); + ASSERT_EQ("foo", Read()); + Write("xxx"); + ForceError(3); + UnmarkEOF(); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ(3U, DroppedBytes()); + ASSERT_EQ("OK", MatchError("read error")); +} + +TEST_P(LogTest, Recycle) { + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { + return; // test is only valid for recycled logs + } + Write("foo"); + Write("bar"); + Write("baz"); + Write("bif"); + Write("blitz"); + while (get_reader_contents()->size() < log::kBlockSize * 2) { + Write("xxxxxxxxxxxxxxxx"); + } + std::unique_ptr dest_holder(test::GetWritableFileWriter( + new test::OverwritingStringSink(get_reader_contents()), + "" /* don't care */)); + Writer recycle_writer(std::move(dest_holder), 123, true); + recycle_writer.AddRecord(Slice("foooo")); + recycle_writer.AddRecord(Slice("bar")); + ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); + ASSERT_EQ("foooo", Read()); + ASSERT_EQ("bar", Read()); + ASSERT_EQ("EOF", Read()); +} + +INSTANTIATE_TEST_CASE_P(bool, LogTest, + ::testing::Values(std::make_tuple(0, false), + std::make_tuple(0, true), + std::make_tuple(1, false), + std::make_tuple(1, true))); + +class RetriableLogTest : public ::testing::TestWithParam { + private: + class ReportCollector : public Reader::Reporter { + public: + size_t dropped_bytes_; + std::string message_; + + ReportCollector() : dropped_bytes_(0) {} + void Corruption(size_t bytes, const Status& status) override { + dropped_bytes_ += bytes; + message_.append(status.ToString()); + } + }; + + Slice contents_; + std::unique_ptr dest_holder_; + std::unique_ptr log_writer_; + Env* env_; + EnvOptions env_options_; + const std::string test_dir_; + const std::string log_file_; + std::unique_ptr writer_; + std::unique_ptr reader_; + ReportCollector report_; + std::unique_ptr log_reader_; + + public: + RetriableLogTest() + : contents_(), + dest_holder_(nullptr), + log_writer_(nullptr), + env_(Env::Default()), + test_dir_(test::PerThreadDBPath("retriable_log_test")), + log_file_(test_dir_ + "/log"), + writer_(nullptr), + reader_(nullptr), + log_reader_(nullptr) {} + + Status SetupTestEnv() { + dest_holder_.reset(test::GetWritableFileWriter( + new test::StringSink(&contents_), "" /* file name */)); + assert(dest_holder_ != nullptr); + log_writer_.reset(new Writer(std::move(dest_holder_), 123, GetParam())); + assert(log_writer_ != nullptr); + + Status s; + s = env_->CreateDirIfMissing(test_dir_); + std::unique_ptr writable_file; + if (s.ok()) { + s = env_->NewWritableFile(log_file_, &writable_file, env_options_); + } + if (s.ok()) { + writer_.reset(new WritableFileWriter( + NewLegacyWritableFileWrapper(std::move(writable_file)), log_file_, + env_options_)); + assert(writer_ != nullptr); + } + std::unique_ptr seq_file; + if (s.ok()) { + s = env_->NewSequentialFile(log_file_, &seq_file, env_options_); + } + if (s.ok()) { + reader_.reset(new SequentialFileReader( + NewLegacySequentialFileWrapper(seq_file), log_file_)); + assert(reader_ != nullptr); + log_reader_.reset(new FragmentBufferedReader( + nullptr, std::move(reader_), &report_, true /* checksum */, + 123 /* log_number */)); + assert(log_reader_ != nullptr); + } + return s; + } + + std::string contents() { + auto file = 
test::GetStringSinkFromLegacyWriter(log_writer_->file()); + assert(file != nullptr); + return file->contents_; + } + + void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); } + + void Write(const Slice& data) { + writer_->Append(data); + writer_->Sync(true); + } + + bool TryRead(std::string* result) { + assert(result != nullptr); + result->clear(); + std::string scratch; + Slice record; + bool r = log_reader_->ReadRecord(&record, &scratch); + if (r) { + result->assign(record.data(), record.size()); + return true; + } else { + return false; + } + } +}; + +TEST_P(RetriableLogTest, TailLog_PartialHeader) { + ASSERT_OK(SetupTestEnv()); + std::vector remaining_bytes_in_last_record; + size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + bool eof = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"RetriableLogTest::TailLog:AfterPart1", + "RetriableLogTest::TailLog:BeforeReadRecord"}, + {"FragmentBufferedLogReader::TryReadMore:FirstEOF", + "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t delta = header_size - 1; + port::Thread log_writer_thread([&]() { + size_t old_sz = contents().size(); + Encode("foo"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1"); + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2"); + Write(Slice(part2)); + }); + + std::string record; + port::Thread log_reader_thread([&]() { + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); + while (!TryRead(&record)) { + } + }); + log_reader_thread.join(); + log_writer_thread.join(); + ASSERT_EQ("foo", record); + ASSERT_TRUE(eof); +} + +TEST_P(RetriableLogTest, TailLog_FullHeader) { + ASSERT_OK(SetupTestEnv()); + std::vector remaining_bytes_in_last_record; + size_t header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + bool eof = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"RetriableLogTest::TailLog:AfterPart1", + "RetriableLogTest::TailLog:BeforeReadRecord"}, + {"FragmentBufferedLogReader::TryReadMore:FirstEOF", + "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t delta = header_size + 1; + port::Thread log_writer_thread([&]() { + size_t old_sz = contents().size(); + Encode("foo"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1"); + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2"); + Write(Slice(part2)); + ASSERT_TRUE(eof); + }); + + std::string record; + port::Thread log_reader_thread([&]() { + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); + while (!TryRead(&record)) { + } + }); + log_reader_thread.join(); + log_writer_thread.join(); + ASSERT_EQ("foo", record); +} + +TEST_P(RetriableLogTest, NonBlockingReadFullRecord) { + // Clear all sync point callbacks even if this test does not use sync point. + // It is necessary, otherwise the execute of this test may hit a sync point + // with which a callback is registered. The registered callback may access + // some dead variable, causing segfault. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(SetupTestEnv()); + size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + size_t delta = header_size - 1; + size_t old_sz = contents().size(); + Encode("foo-bar"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + std::string record; + ASSERT_FALSE(TryRead(&record)); + ASSERT_TRUE(record.empty()); + Write(Slice(part2)); + ASSERT_TRUE(TryRead(&record)); + ASSERT_EQ("foo-bar", record); +} + +INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2)); + +} // namespace log +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc new file mode 100644 index 000000000..0222ee2a7 --- /dev/null +++ b/src/rocksdb/db/log_writer.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
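For orientation before the implementation: the sketch below mirrors the fragmentation arithmetic that Writer::AddRecord performs, splitting one logical record into physical fragments across 32 KiB blocks with 7-byte legacy headers. It is a standalone illustration, not part of this patch; Fragment and SplitIntoFragments are hypothetical names, and only the constants and loop structure correspond to the code that follows.

#include <cstddef>
#include <vector>

constexpr size_t kBlockSize = 32768;  // matches log_format.h
constexpr size_t kHeaderSize = 7;     // CRC(4) + length(2) + type(1)

struct Fragment {
  bool first;  // would carry kFirstType (or kFullType if also last)
  bool last;   // would carry kLastType (or kFullType if also first)
  size_t len;  // payload bytes carried by this physical record
};

// Split `payload` bytes into fragments, starting at `block_offset` within
// the current block; mirrors the do/while loop in Writer::AddRecord.
std::vector<Fragment> SplitIntoFragments(size_t payload, size_t block_offset) {
  std::vector<Fragment> fragments;
  bool begin = true;
  do {
    size_t leftover = kBlockSize - block_offset;
    if (leftover < kHeaderSize) {
      block_offset = 0;  // trailer is zero-padded; switch to a new block
    }
    size_t avail = kBlockSize - block_offset - kHeaderSize;
    size_t frag = payload < avail ? payload : avail;
    payload -= frag;
    fragments.push_back({begin, payload == 0, frag});
    block_offset += kHeaderSize + frag;
    begin = false;
  } while (payload > 0);
  return fragments;
}

For example, a 40000-byte record written at block offset 0 becomes a 32761-byte kFirstType fragment followed by a 7239-byte kLastType fragment; an empty record still produces one zero-length kFullType fragment, which is why AddRecord iterates at least once.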
+ +#include "db/log_writer.h" + +#include +#include "file/writable_file_writer.h" +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { +namespace log { + +Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, + bool recycle_log_files, bool manual_flush) + : dest_(std::move(dest)), + block_offset_(0), + log_number_(log_number), + recycle_log_files_(recycle_log_files), + manual_flush_(manual_flush) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc_[i] = crc32c::Value(&t, 1); + } +} + +Writer::~Writer() { + if (dest_) { + WriteBuffer(); + } +} + +Status Writer::WriteBuffer() { return dest_->Flush(); } + +Status Writer::Close() { + Status s; + if (dest_) { + s = dest_->Close(); + dest_.reset(); + } + return s; +} + +Status Writer::AddRecord(const Slice& slice) { + const char* ptr = slice.data(); + size_t left = slice.size(); + + // Header size varies depending on whether we are recycling or not. + const int header_size = + recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize; + + // Fragment the record if necessary and emit it. Note that if slice + // is empty, we still want to iterate once to emit a single + // zero-length record + Status s; + bool begin = true; + do { + const int64_t leftover = kBlockSize - block_offset_; + assert(leftover >= 0); + if (leftover < header_size) { + // Switch to a new block + if (leftover > 0) { + // Fill the trailer (literal below relies on kHeaderSize and + // kRecyclableHeaderSize being <= 11) + assert(header_size <= 11); + s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + static_cast(leftover))); + if (!s.ok()) { + break; + } + } + block_offset_ = 0; + } + + // Invariant: we never leave < header_size bytes in a block. + assert(static_cast(kBlockSize - block_offset_) >= header_size); + + const size_t avail = kBlockSize - block_offset_ - header_size; + const size_t fragment_length = (left < avail) ? left : avail; + + RecordType type; + const bool end = (left == fragment_length); + if (begin && end) { + type = recycle_log_files_ ? kRecyclableFullType : kFullType; + } else if (begin) { + type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; + } else if (end) { + type = recycle_log_files_ ? kRecyclableLastType : kLastType; + } else { + type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType; + } + + s = EmitPhysicalRecord(type, ptr, fragment_length); + ptr += fragment_length; + left -= fragment_length; + begin = false; + } while (s.ok() && left > 0); + + if (s.ok()) { + if (!manual_flush_) { + s = dest_->Flush(); + } + } + + return s; +} + +bool Writer::TEST_BufferIsEmpty() { return dest_->TEST_BufferIsEmpty(); } + +Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { + assert(n <= 0xffff); // Must fit in two bytes + + size_t header_size; + char buf[kRecyclableHeaderSize]; + + // Format the header + buf[4] = static_cast(n & 0xff); + buf[5] = static_cast(n >> 8); + buf[6] = static_cast(t); + + uint32_t crc = type_crc_[t]; + if (t < kRecyclableFullType) { + // Legacy record format + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + header_size = kHeaderSize; + } else { + // Recyclable record format + assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize); + header_size = kRecyclableHeaderSize; + + // Only encode low 32-bits of the 64-bit log number. 
+    // This means we will fail to detect an old record if we recycled a log
+    // from ~4 billion logs ago, but that is effectively impossible, and even
+    // if it were we'd be far more likely to see a false positive on the
+    // 32-bit CRC.
+    EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
+    crc = crc32c::Extend(crc, buf + 7, 4);
+  }
+
+  // Compute the crc of the record type and the payload.
+  crc = crc32c::Extend(crc, ptr, n);
+  crc = crc32c::Mask(crc);  // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, header_size));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+  }
+  block_offset_ += header_size + n;
+  return s;
+}
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
new file mode 100644
index 000000000..a7f952edd
--- /dev/null
+++ b/src/rocksdb/db/log_writer.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+
+namespace log {
+
+/**
+ * Writer is a general purpose log stream writer. It provides an append-only
+ * abstraction for writing data. The details of how the data is written are
+ * handled by the WritableFile sub-class implementation.
+ *
+ * File format:
+ *
+ * File is broken down into variable sized records. The format of each record
+ * is described below.
+ *       +-----+-------------+--+----+----------+------+-- ... ----+
+ * File  | r0  |      r1     |P | r2 |    r3    |  r4  |           |
+ *       +-----+-------------+--+----+----------+------+-- ... ----+
+ *       <--- kBlockSize ------>|<-- kBlockSize ------>|
+ *  rn = variable size records
+ *  P = Padding
+ *
+ * Data is written out in kBlockSize chunks. If the next record does not fit
+ * into the space left, the leftover space will be padded with \0.
+ *
+ * Legacy record format:
+ *
+ * +---------+-----------+-----------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Payload   |
+ * +---------+-----------+-----------+--- ... ---+
+ *
+ * CRC = 32bit hash computed over the record type and payload using CRC
+ * Size = Length of the payload data
+ * Type = Type of record
+ *        (kZeroType, kFullType, kFirstType, kLastType, kMiddleType)
+ *        The type is used to group a bunch of records together to represent
+ *        blocks that are larger than kBlockSize
+ * Payload = Byte stream as long as specified by the payload size
+ *
+ * Recyclable record format:
+ *
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ *
+ * Same as above, with the addition of
+ * Log number = 32bit log file number, so that we can distinguish between
+ * records written by the most recent log writer vs a previous one.
+ */
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use. + explicit Writer(std::unique_ptr&& dest, + uint64_t log_number, bool recycle_log_files, + bool manual_flush = false); + // No copying allowed + Writer(const Writer&) = delete; + void operator=(const Writer&) = delete; + + ~Writer(); + + Status AddRecord(const Slice& slice); + + WritableFileWriter* file() { return dest_.get(); } + const WritableFileWriter* file() const { return dest_.get(); } + + uint64_t get_log_number() const { return log_number_; } + + Status WriteBuffer(); + + Status Close(); + + bool TEST_BufferIsEmpty(); + + private: + std::unique_ptr dest_; + size_t block_offset_; // Current offset in block + uint64_t log_number_; + bool recycle_log_files_; + + // crc32c values for all supported record types. These are + // pre-computed to reduce the overhead of computing the crc of the + // record type stored in the header. + uint32_t type_crc_[kMaxRecordType + 1]; + + Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + + // If true, it does not flush after each write. Instead it relies on the upper + // layer to manually does the flush by calling ::WriteBuffer() + bool manual_flush_; +}; + +} // namespace log +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/logs_with_prep_tracker.cc b/src/rocksdb/db/logs_with_prep_tracker.cc new file mode 100644 index 000000000..ff98155c4 --- /dev/null +++ b/src/rocksdb/db/logs_with_prep_tracker.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/logs_with_prep_tracker.h" + +#include "port/likely.h" + +namespace ROCKSDB_NAMESPACE { +void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { + assert(log != 0); + std::lock_guard lock(prepared_section_completed_mutex_); + auto it = prepared_section_completed_.find(log); + if (UNLIKELY(it == prepared_section_completed_.end())) { + prepared_section_completed_[log] = 1; + } else { + it->second += 1; + } +} + +void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) { + assert(log != 0); + std::lock_guard lock(logs_with_prep_mutex_); + + auto rit = logs_with_prep_.rbegin(); + bool updated = false; + // Most probably the last log is the one that is being marked for + // having a prepare section; so search from the end. 
+ for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) { + if (rit->log == log) { + rit->cnt++; + updated = true; + break; + } + } + if (!updated) { + // We are either at the start, or at a position with rit->log < log + logs_with_prep_.insert(rit.base(), {log, 1}); + } +} + +uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() { + std::lock_guard<std::mutex> lock(logs_with_prep_mutex_); + auto it = logs_with_prep_.begin(); + // start with the smallest log + for (; it != logs_with_prep_.end();) { + auto min_log = it->log; + { + std::lock_guard<std::mutex> lock2(prepared_section_completed_mutex_); + auto completed_it = prepared_section_completed_.find(min_log); + if (completed_it == prepared_section_completed_.end() || + completed_it->second < it->cnt) { + return min_log; + } + assert(completed_it != prepared_section_completed_.end() && + completed_it->second == it->cnt); + prepared_section_completed_.erase(completed_it); + } + // erasing from the beginning of a vector is not efficient, but this + // function is not on the fast path. + it = logs_with_prep_.erase(it); + } + // no such log found + return 0; +} +} // namespace ROCKSDB_NAMESPACE
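The interplay between the two maps is subtle, so here is a small self-contained model of the bookkeeping (illustrative only; std::map stands in for the sorted vector plus unordered_map pair, and the single-threaded demo ignores the mutexes):

```cpp
#include <cassert>
#include <cstdint>
#include <map>

// Model of the tracker's resolution rule: logs_with_prep counts prepares
// per log; completed counts commits/rollbacks. A log is retired only once
// the two counts match, so the minimum outstanding log is the first
// mismatch found scanning from the smallest log.
int main() {
  std::map<uint64_t, uint64_t> with_prep, completed;
  auto prepare = [&](uint64_t log) { ++with_prep[log]; };
  auto finish = [&](uint64_t log) { ++completed[log]; };
  auto min_outstanding = [&]() -> uint64_t {
    for (auto it = with_prep.begin(); it != with_prep.end();) {
      auto done = completed.find(it->first);
      if (done == completed.end() || done->second < it->second) {
        return it->first;  // still has un-committed prepared data
      }
      completed.erase(done);      // fully resolved: retire the log
      it = with_prep.erase(it);
    }
    return 0;  // no log has outstanding prepared data
  };

  prepare(5); prepare(5); prepare(6);
  finish(5);
  assert(min_outstanding() == 5);  // one prepare in log 5 still open
  finish(5);
  assert(min_outstanding() == 6);  // log 5 fully resolved, log 6 remains
  finish(6);
  assert(min_outstanding() == 0);
}
```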
diff --git a/src/rocksdb/db/logs_with_prep_tracker.h b/src/rocksdb/db/logs_with_prep_tracker.h new file mode 100644 index 000000000..86c88012a --- /dev/null +++ b/src/rocksdb/db/logs_with_prep_tracker.h @@ -0,0 +1,63 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include <stdint.h> +#include <cassert> +#include <cstdlib> +#include <mutex> +#include <unordered_map> +#include <vector> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This class is used to track the log files with outstanding prepare entries. +class LogsWithPrepTracker { + public: + // Called when a transaction prepared in `log` has been committed or aborted. + void MarkLogAsHavingPrepSectionFlushed(uint64_t log); + // Called when a transaction is prepared in `log`. + void MarkLogAsContainingPrepSection(uint64_t log); + // Return the earliest log file with outstanding prepare entries. + uint64_t FindMinLogContainingOutstandingPrep(); + size_t TEST_PreparedSectionCompletedSize() { + return prepared_section_completed_.size(); + } + size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); } + + private: + // REQUIRES: logs_with_prep_mutex_ held + // + // sorted list of log numbers still containing prepared data. + // this is used by FindObsoleteFiles to determine which + // flushed logs we must keep around because they still + // contain prepared data which has not been committed or rolled back + struct LogCnt { + uint64_t log; // the log number + uint64_t cnt; // number of prepared sections in the log + }; + std::vector<LogCnt> logs_with_prep_; + std::mutex logs_with_prep_mutex_; + + // REQUIRES: prepared_section_completed_mutex_ held + // + // to be used in conjunction with logs_with_prep_. + // once a transaction with data in log L is committed or rolled back + // rather than updating logs_with_prep_ directly we keep track of that + // in prepared_section_completed_ which maps LOG -> instance_count. This helps + // avoiding contention between a commit thread and the prepare threads. + // + // when trying to determine the minimum log still active we first + // consult logs_with_prep_. While the smallest log there maps to + // an equal count in prepared_section_completed_ we erase the log from + // both logs_with_prep_ and prepared_section_completed_. + std::unordered_map<uint64_t, uint64_t> prepared_section_completed_; + std::mutex prepared_section_completed_mutex_; + +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/lookup_key.h b/src/rocksdb/db/lookup_key.h new file mode 100644 index 000000000..51e5daed1 --- /dev/null +++ b/src/rocksdb/db/lookup_key.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <string> +#include <utility> +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +// A helper class useful for DBImpl::Get() +class LookupKey { + public: + // Initialize *this for looking up user_key at a snapshot with + // the specified sequence number. + LookupKey(const Slice& _user_key, SequenceNumber sequence, + const Slice* ts = nullptr); + + ~LookupKey(); + + // Return a key suitable for lookup in a MemTable. + Slice memtable_key() const { + return Slice(start_, static_cast<size_t>(end_ - start_)); + } + + // Return an internal key (suitable for passing to an internal iterator) + Slice internal_key() const { + return Slice(kstart_, static_cast<size_t>(end_ - kstart_)); + } + + // Return the user key + Slice user_key() const { + return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8)); + } + + private: + // We construct a char array of the form: + // klength varint32 <-- start_ + // userkey char[klength] <-- kstart_ + // tag uint64 + // <-- end_ + // The array is a suitable MemTable key. + // The suffix starting with "userkey" can be used as an InternalKey. + const char* start_; + const char* kstart_; + const char* end_; + char space_[200]; // Avoid allocation for short keys + + // No copying allowed + LookupKey(const LookupKey&); + void operator=(const LookupKey&); +}; + +inline LookupKey::~LookupKey() { + if (start_ != space_) delete[] start_; +} + +} // namespace ROCKSDB_NAMESPACE
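The three accessors above are just different starting offsets into one buffer, which is easiest to see by building the layout by hand. A standalone sketch of the encoding LookupKey produces (illustrative, not RocksDB code; the tag packing `(seq << 8) | type` matches PackSequenceAndType in db/dbformat.h):

```cpp
#include <cstdint>
#include <cstring>
#include <string>

// Layout: varint32(user_key size + 8) | user_key bytes | 8-byte tag.
// memtable_key(), internal_key() and user_key() are three views over it.
std::string MakeMemtableKey(const std::string& user_key, uint64_t seq,
                            uint8_t type /* e.g. 1 == kTypeValue */) {
  std::string out;
  uint32_t klen = static_cast<uint32_t>(user_key.size()) + 8;
  while (klen >= 128) {  // minimal varint32 encoder
    out.push_back(static_cast<char>(klen | 128));
    klen >>= 7;
  }
  out.push_back(static_cast<char>(klen));
  out.append(user_key);
  uint64_t tag = (seq << 8) | type;
  char buf[8];
  std::memcpy(buf, &tag, 8);  // little-endian fixed64, like EncodeFixed64
  out.append(buf, 8);
  return out;
}

int main() {
  std::string mk = MakeMemtableKey("my_key", /*seq=*/42, /*type=*/1);
  // 1 length byte + 6 key bytes + 8 tag bytes
  return mk.size() == 15 ? 0 : 1;
}
```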
diff --git a/src/rocksdb/db/malloc_stats.cc b/src/rocksdb/db/malloc_stats.cc new file mode 100644 index 000000000..12824e516 --- /dev/null +++ b/src/rocksdb/db/malloc_stats.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/malloc_stats.h" + +#ifndef ROCKSDB_LITE +#include <memory> +#include <string.h> + +#include "port/jemalloc_helper.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_JEMALLOC + +typedef struct { + char* cur; + char* end; +} MallocStatus; + +static void GetJemallocStatus(void* mstat_arg, const char* status) { + MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg); + size_t status_len = status ? strlen(status) : 0; + size_t buf_size = (size_t)(mstat->end - mstat->cur); + if (!status_len || status_len > buf_size) { + return; + } + + snprintf(mstat->cur, buf_size, "%s", status); + mstat->cur += status_len; +} +void DumpMallocStats(std::string* stats) { + if (!HasJemalloc()) { + return; + } + MallocStatus mstat; + const unsigned int kMallocStatusLen = 1000000; + std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]}; + mstat.cur = buf.get(); + mstat.end = buf.get() + kMallocStatusLen; + malloc_stats_print(GetJemallocStatus, &mstat, ""); + stats->append(buf.get()); +} +#else +void DumpMallocStats(std::string*) {} +#endif // ROCKSDB_JEMALLOC +} // namespace ROCKSDB_NAMESPACE +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/malloc_stats.h b/src/rocksdb/db/malloc_stats.h new file mode 100644 index 000000000..18aff3ad0 --- /dev/null +++ b/src/rocksdb/db/malloc_stats.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +void DumpMallocStats(std::string*); + +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/manual_compaction_test.cc b/src/rocksdb/db/manual_compaction_test.cc new file mode 100644 index 000000000..22cd919b5 --- /dev/null +++ b/src/rocksdb/db/manual_compaction_test.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Test for issue 178: a manual compaction causes deleted data to reappear. +#include <iostream> +#include <sstream> +#include <cstdlib> + +#include "port/port.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/write_batch.h" +#include "test_util/testharness.h" + +using namespace ROCKSDB_NAMESPACE; + +namespace { + +// Reasoning: previously the number was 1100000. Since the keys are written to +// the batch in one write, each write will result in one SST file. We reduced +// the write_buffer_size to 1K to get basically the same effect with a much +// smaller number of keys, which results in less test runtime. +const int kNumKeys = 1100; + +std::string Key1(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "my_key_%d", i); + return buf; +} + +std::string Key2(int i) { + return Key1(i) + "_xxx"; +} + +class ManualCompactionTest : public testing::Test { + public: + ManualCompactionTest() { + // Get rid of any state from an old run.
+ dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath("rocksdb_cbug_test"); + DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return existing_value.ToString() == "destroy"; + } + + const char* Name() const override { return "DestroyAllCompactionFilter"; } +}; + +TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = ROCKSDB_NAMESPACE::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(CompactRangeOptions(), nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} + +TEST_F(ManualCompactionTest, Test) { + // Open database. Disable compression since it affects the creation + // of layers and the code below is trying to test against a very + // specific scenario. 
+ ROCKSDB_NAMESPACE::DB* db; + ROCKSDB_NAMESPACE::Options db_options; + db_options.write_buffer_size = 1024; + db_options.create_if_missing = true; + db_options.compression = ROCKSDB_NAMESPACE::kNoCompression; + ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(db_options, dbname_, &db)); + + // create first key range + ROCKSDB_NAMESPACE::WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key1(i), "value for range 1 key"); + } + ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + + // create second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Put(Key2(i), "value for range 2 key"); + } + ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + + // delete second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + batch.Delete(Key2(i)); + } + ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + + // compact database + std::string start_key = Key1(0); + std::string end_key = Key1(kNumKeys - 1); + ROCKSDB_NAMESPACE::Slice least(start_key.data(), start_key.size()); + ROCKSDB_NAMESPACE::Slice greatest(end_key.data(), end_key.size()); + + // commenting out the line below causes the example to work correctly + db->CompactRange(CompactRangeOptions(), &least, &greatest); + + // count the keys + ROCKSDB_NAMESPACE::Iterator* iter = + db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + delete iter; + ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; + + // close database + delete db; + DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options()); +} + +} // anonymous namespace + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc new file mode 100644 index 000000000..45483ea09 --- /dev/null +++ b/src/rocksdb/db/memtable.cc @@ -0,0 +1,1122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/memtable.h" + +#include +#include +#include +#include +#include "db/dbformat.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/read_callback.h" +#include "memory/arena.h" +#include "memory/memory_usage.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/internal_iterator.h" +#include "table/iterator_wrapper.h" +#include "table/merging_iterator.h" +#include "util/autovector.h" +#include "util/coding.h" +#include "util/mutexlock.h" +#include "util/util.h" + +namespace ROCKSDB_NAMESPACE { + +ImmutableMemTableOptions::ImmutableMemTableOptions( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) + : arena_block_size(mutable_cf_options.arena_block_size), + memtable_prefix_bloom_bits( + static_cast( + static_cast(mutable_cf_options.write_buffer_size) * + mutable_cf_options.memtable_prefix_bloom_size_ratio) * + 8u), + memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size), + memtable_whole_key_filtering( + mutable_cf_options.memtable_whole_key_filtering), + inplace_update_support(ioptions.inplace_update_support), + inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), + inplace_callback(ioptions.inplace_callback), + max_successive_merges(mutable_cf_options.max_successive_merges), + statistics(ioptions.statistics), + merge_operator(ioptions.merge_operator), + info_log(ioptions.info_log) {} + +MemTable::MemTable(const InternalKeyComparator& cmp, + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + WriteBufferManager* write_buffer_manager, + SequenceNumber latest_seq, uint32_t column_family_id) + : comparator_(cmp), + moptions_(ioptions, mutable_cf_options), + refs_(0), + kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), + mem_tracker_(write_buffer_manager), + arena_(moptions_.arena_block_size, + (write_buffer_manager != nullptr && + (write_buffer_manager->enabled() || + write_buffer_manager->cost_to_cache())) + ? &mem_tracker_ + : nullptr, + mutable_cf_options.memtable_huge_page_size), + table_(ioptions.memtable_factory->CreateMemTableRep( + comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), + ioptions.info_log, column_family_id)), + range_del_table_(SkipListFactory().CreateMemTableRep( + comparator_, &arena_, nullptr /* transform */, ioptions.info_log, + column_family_id)), + is_range_del_table_empty_(true), + data_size_(0), + num_entries_(0), + num_deletes_(0), + write_buffer_size_(mutable_cf_options.write_buffer_size), + flush_in_progress_(false), + flush_completed_(false), + file_number_(0), + first_seqno_(0), + earliest_seqno_(latest_seq), + creation_seq_(latest_seq), + mem_next_logfile_number_(0), + min_prep_log_referenced_(0), + locks_(moptions_.inplace_update_support + ? 
+ +MemTable::MemTable(const InternalKeyComparator& cmp, + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + WriteBufferManager* write_buffer_manager, + SequenceNumber latest_seq, uint32_t column_family_id) + : comparator_(cmp), + moptions_(ioptions, mutable_cf_options), + refs_(0), + kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), + mem_tracker_(write_buffer_manager), + arena_(moptions_.arena_block_size, + (write_buffer_manager != nullptr && + (write_buffer_manager->enabled() || + write_buffer_manager->cost_to_cache())) + ? &mem_tracker_ + : nullptr, + mutable_cf_options.memtable_huge_page_size), + table_(ioptions.memtable_factory->CreateMemTableRep( + comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), + ioptions.info_log, column_family_id)), + range_del_table_(SkipListFactory().CreateMemTableRep( + comparator_, &arena_, nullptr /* transform */, ioptions.info_log, + column_family_id)), + is_range_del_table_empty_(true), + data_size_(0), + num_entries_(0), + num_deletes_(0), + write_buffer_size_(mutable_cf_options.write_buffer_size), + flush_in_progress_(false), + flush_completed_(false), + file_number_(0), + first_seqno_(0), + earliest_seqno_(latest_seq), + creation_seq_(latest_seq), + mem_next_logfile_number_(0), + min_prep_log_referenced_(0), + locks_(moptions_.inplace_update_support + ? moptions_.inplace_update_num_locks + : 0), + prefix_extractor_(mutable_cf_options.prefix_extractor.get()), + flush_state_(FLUSH_NOT_REQUESTED), + env_(ioptions.env), + insert_with_hint_prefix_extractor_( + ioptions.memtable_insert_with_hint_prefix_extractor), + oldest_key_time_(std::numeric_limits<uint64_t>::max()), + atomic_flush_seqno_(kMaxSequenceNumber), + approximate_memory_usage_(0) { + UpdateFlushState(); + // something went wrong if we need to flush before inserting anything + assert(!ShouldScheduleFlush()); + + // use bloom_filter_ for both whole key and prefix bloom filter + if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) && + moptions_.memtable_prefix_bloom_bits > 0) { + bloom_filter_.reset( + new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, + 6 /* hard coded 6 probes */, + moptions_.memtable_huge_page_size, ioptions.info_log)); + } +} + +MemTable::~MemTable() { + mem_tracker_.FreeMem(); + assert(refs_ == 0); +} + +size_t MemTable::ApproximateMemoryUsage() { + autovector<size_t> usages = { + arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(), + range_del_table_->ApproximateMemoryUsage(), + ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)}; + size_t total_usage = 0; + for (size_t usage : usages) { + // If usage + total_usage >= kMaxSizet, return kMaxSizet. + // the following variation is to avoid numeric overflow. + if (usage >= port::kMaxSizet - total_usage) { + return port::kMaxSizet; + } + total_usage += usage; + } + approximate_memory_usage_.store(total_usage, std::memory_order_relaxed); + // otherwise, return the actual usage + return total_usage; +} + +bool MemTable::ShouldFlushNow() { + size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed); + // Most of the time, we cannot allocate arena blocks that exactly match the + // buffer size. Thus we have to decide if we should over-allocate or + // under-allocate. + // This constant variable can be interpreted as: if we still have more than + // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to + // over-allocate one more block. + const double kAllowOverAllocationRatio = 0.6; + + // If the arena still has room for new block allocation, we can safely say it + // shouldn't flush. + auto allocated_memory = table_->ApproximateMemoryUsage() + + range_del_table_->ApproximateMemoryUsage() + + arena_.MemoryAllocatedBytes(); + + approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed); + + // if we can still allocate one more block without exceeding the + // over-allocation ratio, then we should not flush. + if (allocated_memory + kArenaBlockSize < + write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) { + return false; + } + + // if the user keeps adding entries that exceed write_buffer_size, we need to + // flush earlier even though we still have much available memory left. + if (allocated_memory > + write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) { + return true; + } + + // In this code path, Arena has already allocated its "last block", which + // means the total allocated memory size is either: + // (1) "moderately" over-allocated (no more than 0.6 * arena block size). Or, + // (2) the allocated memory is less than write buffer size, but we'll stop + // here since if we allocate a new arena block, we'll over-allocate too much + // more (half of the arena block size) memory. + // + // In either case, to avoid over-allocating, the last block will stop allocation + // when its usage reaches a certain ratio, which we carefully choose "0.75 + // full" as the stop condition because it addresses the following issue with + // great simplicity: What if the next inserted entry's size is + // bigger than AllocatedAndUnused()? + // + // The answer is: if the entry size is also bigger than 0.25 * + // kArenaBlockSize, a dedicated block will be allocated for it; otherwise + // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty + // and regular block. In either case, we *overly* over-allocated. + // + // Therefore, setting the last block to be at most "0.75 full" avoids both + // cases. + // + // NOTE: the average percentage of wasted space of this approach can be + // estimated as: "arena block size * 0.25 / write buffer size". Users who + // specify a small write buffer size and/or a big arena block size may suffer. + return arena_.AllocatedAndUnused() < kArenaBlockSize / 4; +}
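The two thresholds in ShouldFlushNow() are easy to sanity-check numerically. A standalone restatement under made-up sizes (illustrative; the middle "last block 0.75 full" case is collapsed into a single branch here):

```cpp
#include <cstdint>
#include <cstdio>

// `allocated` plays the role of table + range-del + arena memory; the 0.6
// over-allocation ratio matches the constant in ShouldFlushNow().
bool ShouldFlushNowSim(uint64_t allocated, uint64_t block, uint64_t buffer) {
  const double kAllowOverAllocationRatio = 0.6;
  // Room for one more block without blowing past buffer + 0.6*block: no flush.
  if (allocated + block < buffer + block * kAllowOverAllocationRatio) {
    return false;
  }
  // Already past buffer + 0.6*block: flush.
  if (allocated > buffer + block * kAllowOverAllocationRatio) {
    return true;
  }
  // In between: the real code defers to the last block's fill level.
  return true;
}

int main() {
  const uint64_t kB = 8u << 20, buf = 64u << 20;  // 8 MiB blocks, 64 MiB buffer
  std::printf("%d %d\n",
              ShouldFlushNowSim(40u << 20, kB, buf),   // 0: plenty of room
              ShouldFlushNowSim(70u << 20, kB, buf));  // 1: over budget
}
```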
+ +void MemTable::UpdateFlushState() { + auto state = flush_state_.load(std::memory_order_relaxed); + if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) { + // ignore CAS failure, because that means somebody else requested + // a flush + flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED, + std::memory_order_relaxed, + std::memory_order_relaxed); + } +} + +void MemTable::UpdateOldestKeyTime() { + uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed); + if (oldest_key_time == std::numeric_limits<uint64_t>::max()) { + int64_t current_time = 0; + auto s = env_->GetCurrentTime(&current_time); + if (s.ok()) { + assert(current_time >= 0); + // If fail, the timestamp is already set. + oldest_key_time_.compare_exchange_strong( + oldest_key_time, static_cast<uint64_t>(current_time), + std::memory_order_relaxed, std::memory_order_relaxed); + } + } +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const { + // Internal keys are encoded as length-prefixed strings. + Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); + Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); + return comparator.CompareKeySeq(k1, k2); +} + +int MemTable::KeyComparator::operator()(const char* prefix_len_key, + const KeyComparator::DecodedType& key) + const { + // Internal keys are encoded as length-prefixed strings. + Slice a = GetLengthPrefixedSlice(prefix_len_key); + return comparator.CompareKeySeq(a, key); +} + +void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) { +#ifndef ROCKSDB_LITE + throw std::runtime_error("concurrent insert not supported"); +#else + abort(); +#endif +} + +Slice MemTableRep::UserKey(const char* key) const { + Slice slice = GetLengthPrefixedSlice(key); + return Slice(slice.data(), slice.size() - 8); +} + +KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { + *buf = allocator_->Allocate(len); + return static_cast<KeyHandle>(*buf); +} + +// Encode a suitable internal key target for "target" and return it. +// Uses *scratch as scratch space, and the returned pointer will point +// into this scratch space. +const char* EncodeKey(std::string* scratch, const Slice& target) { + scratch->clear(); + PutVarint32(scratch, static_cast<uint32_t>(target.size())); + scratch->append(target.data(), target.size()); + return scratch->data(); +}
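EncodeKey and GetLengthPrefixedSlice are two halves of the same encoding, which a short round trip makes concrete. A minimal sketch of the varint32 length prefix (illustrative; RocksDB's real versions live in util/coding.h and include bounds checking this sketch omits):

```cpp
#include <cassert>
#include <cstdint>
#include <string>

void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>(v | 128));  // low 7 bits, continue bit set
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

const char* GetVarint32(const char* p, uint32_t* v) {
  uint32_t result = 0;
  for (int shift = 0; shift <= 28; shift += 7) {
    uint32_t byte = static_cast<unsigned char>(*p++);
    result |= (byte & 127) << shift;
    if (byte < 128) break;  // continue bit clear: done
  }
  *v = result;
  return p;  // points at the payload, like GetVarint32Ptr
}

int main() {
  std::string scratch;
  PutVarint32(&scratch, 300);  // 300 needs a 2-byte varint
  scratch.append("payload");
  uint32_t len = 0;
  const char* payload = GetVarint32(scratch.data(), &len);
  assert(len == 300 && payload == scratch.data() + 2);
}
```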
+ +class MemTableIterator : public InternalIterator { + public: + MemTableIterator(const MemTable& mem, const ReadOptions& read_options, + Arena* arena, bool use_range_del_table = false) + : bloom_(nullptr), + prefix_extractor_(mem.prefix_extractor_), + comparator_(mem.comparator_), + valid_(false), + arena_mode_(arena != nullptr), + value_pinned_( + !mem.GetImmutableMemTableOptions()->inplace_update_support) { + if (use_range_del_table) { + iter_ = mem.range_del_table_->GetIterator(arena); + } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && + !read_options.auto_prefix_mode) { + // Auto prefix mode is not implemented in memtable yet. + bloom_ = mem.bloom_filter_.get(); + iter_ = mem.table_->GetDynamicPrefixIterator(arena); + } else { + iter_ = mem.table_->GetIterator(arena); + } + } + // No copying allowed + MemTableIterator(const MemTableIterator&) = delete; + void operator=(const MemTableIterator&) = delete; + + ~MemTableIterator() override { +#ifndef NDEBUG + // Assert that the MemTableIterator is never deleted while + // Pinning is Enabled. + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); +#endif + if (arena_mode_) { + iter_->~Iterator(); + } else { + delete iter_; + } + } + +#ifndef NDEBUG + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + bool Valid() const override { return valid_; } + void Seek(const Slice& k) override { + PERF_TIMER_GUARD(seek_on_memtable_time); + PERF_COUNTER_ADD(seek_on_memtable_count, 1); + if (bloom_) { + // iterator should only use prefix bloom filter + Slice user_k(ExtractUserKey(k)); + if (prefix_extractor_->InDomain(user_k) && + !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + } + void SeekForPrev(const Slice& k) override { + PERF_TIMER_GUARD(seek_on_memtable_time); + PERF_COUNTER_ADD(seek_on_memtable_count, 1); + if (bloom_) { + Slice user_k(ExtractUserKey(k)); + if (prefix_extractor_->InDomain(user_k) && + !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && comparator_.comparator.Compare(k, key()) < 0) { + Prev(); + } + } + void SeekToFirst() override { + iter_->SeekToFirst(); + valid_ = iter_->Valid(); + } + void SeekToLast() override { + iter_->SeekToLast(); + valid_ = iter_->Valid(); + } + void Next() override { + PERF_COUNTER_ADD(next_on_memtable_count, 1); + assert(Valid()); + iter_->Next(); + valid_ = iter_->Valid(); + } + void Prev() override { + PERF_COUNTER_ADD(prev_on_memtable_count, 1); + assert(Valid()); + iter_->Prev(); + valid_ = iter_->Valid(); + } + Slice key() const override { + assert(Valid()); + return GetLengthPrefixedSlice(iter_->key()); + } + Slice value() const override { + assert(Valid()); + Slice key_slice = GetLengthPrefixedSlice(iter_->key()); + return
GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + } + + Status status() const override { return Status::OK(); } + + bool IsKeyPinned() const override { + // memtable data is always pinned + return true; + } + + bool IsValuePinned() const override { + // memtable value is always pinned, except if we allow inplace update. + return value_pinned_; + } + + private: + DynamicBloom* bloom_; + const SliceTransform* const prefix_extractor_; + const MemTable::KeyComparator comparator_; + MemTableRep::Iterator* iter_; + bool valid_; + bool arena_mode_; + bool value_pinned_; +}; + +InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, + Arena* arena) { + assert(arena != nullptr); + auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) MemTableIterator(*this, read_options, arena); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( + const ReadOptions& read_options, SequenceNumber read_seq) { + if (read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed)) { + return nullptr; + } + auto* unfragmented_iter = new MemTableIterator( + *this, read_options, nullptr /* arena */, true /* use_range_del_table */); + if (unfragmented_iter == nullptr) { + return nullptr; + } + auto fragmented_tombstone_list = + std::make_shared<FragmentedRangeTombstoneList>( + std::unique_ptr<InternalIterator>(unfragmented_iter), + comparator_.comparator); + + auto* fragmented_iter = new FragmentedRangeTombstoneIterator( + fragmented_tombstone_list, comparator_.comparator, read_seq); + return fragmented_iter; +} + +port::RWMutex* MemTable::GetLock(const Slice& key) { + return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())]; +} + +MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, + const Slice& end_ikey) { + uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey); + entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey); + if (entry_count == 0) { + return {0, 0}; + } + uint64_t n = num_entries_.load(std::memory_order_relaxed); + if (n == 0) { + return {0, 0}; + } + if (entry_count > n) { + // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can + // be larger than actual entries we have. Cap it to entries we have to limit + // the inaccuracy. + entry_count = n; + } + uint64_t data_size = data_size_.load(std::memory_order_relaxed); + return {entry_count * (data_size / n), entry_count}; +} + +bool MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value, bool allow_concurrent, + MemTablePostProcessInfo* post_process_info, void** hint) { + // Format of an entry is concatenation of: + // key_size : varint32 of internal_key.size() + // key bytes : char[internal_key.size()] + // value_size : varint32 of value.size() + // value bytes : char[value.size()] + uint32_t key_size = static_cast<uint32_t>(key.size()); + uint32_t val_size = static_cast<uint32_t>(value.size()); + uint32_t internal_key_size = key_size + 8; + const uint32_t encoded_len = VarintLength(internal_key_size) + + internal_key_size + VarintLength(val_size) + + val_size; + char* buf = nullptr; + std::unique_ptr<MemTableRep>& table = + type == kTypeRangeDeletion ?
range_del_table_ : table_; + KeyHandle handle = table->Allocate(encoded_len, &buf); + + char* p = EncodeVarint32(buf, internal_key_size); + memcpy(p, key.data(), key_size); + Slice key_slice(p, key_size); + p += key_size; + uint64_t packed = PackSequenceAndType(s, type); + EncodeFixed64(p, packed); + p += 8; + p = EncodeVarint32(p, val_size); + memcpy(p, value.data(), val_size); + assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + + if (!allow_concurrent) { + // Extract prefix for insert with hint. + if (insert_with_hint_prefix_extractor_ != nullptr && + insert_with_hint_prefix_extractor_->InDomain(key_slice)) { + Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); + bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); + if (UNLIKELY(!res)) { + return res; + } + } else { + bool res = table->InsertKey(handle); + if (UNLIKELY(!res)) { + return res; + } + } + + // this is a bit ugly, but is the way to avoid locked instructions + // when incrementing an atomic + num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len, + std::memory_order_relaxed); + if (type == kTypeDeletion) { + num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + } + + if (bloom_filter_ && prefix_extractor_ && + prefix_extractor_->InDomain(key)) { + bloom_filter_->Add(prefix_extractor_->Transform(key)); + } + if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { + bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); + } + + // The first sequence number inserted into the memtable + assert(first_seqno_ == 0 || s >= first_seqno_); + if (first_seqno_ == 0) { + first_seqno_.store(s, std::memory_order_relaxed); + + if (earliest_seqno_ == kMaxSequenceNumber) { + earliest_seqno_.store(GetFirstSequenceNumber(), + std::memory_order_relaxed); + } + assert(first_seqno_.load() >= earliest_seqno_.load()); + } + assert(post_process_info == nullptr); + UpdateFlushState(); + } else { + bool res = (hint == nullptr) + ? table->InsertKeyConcurrently(handle) + : table->InsertKeyWithHintConcurrently(handle, hint); + if (UNLIKELY(!res)) { + return res; + } + + assert(post_process_info != nullptr); + post_process_info->num_entries++; + post_process_info->data_size += encoded_len; + if (type == kTypeDeletion) { + post_process_info->num_deletes++; + } + + if (bloom_filter_ && prefix_extractor_ && + prefix_extractor_->InDomain(key)) { + bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); + } + if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { + bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); + } + + // atomically update first_seqno_ and earliest_seqno_. 
+ uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed); + while ((cur_seq_num == 0 || s < cur_seq_num) && + !first_seqno_.compare_exchange_weak(cur_seq_num, s)) { + } + uint64_t cur_earliest_seqno = + earliest_seqno_.load(std::memory_order_relaxed); + while ( + (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) && + !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { + } + } + if (type == kTypeRangeDeletion) { + is_range_del_table_empty_.store(false, std::memory_order_relaxed); + } + UpdateOldestKeyTime(); + return true; +} + +// Callback from MemTable::Get() +namespace { + +struct Saver { + Status* status; + const LookupKey* key; + bool* found_final_value; // Is value set correctly? Used by KeyMayExist + bool* merge_in_progress; + std::string* value; + SequenceNumber seq; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + SequenceNumber max_covering_tombstone_seq; + MemTable* mem; + Logger* logger; + Statistics* statistics; + bool inplace_update_support; + bool do_merge; + Env* env_; + ReadCallback* callback_; + bool* is_blob_index; + + bool CheckCallback(SequenceNumber _seq) { + if (callback_) { + return callback_->IsVisible(_seq); + } + return true; + } +}; +} // namespace + +static bool SaveValue(void* arg, const char* entry) { + Saver* s = reinterpret_cast<Saver*>(arg); + assert(s != nullptr); + MergeContext* merge_context = s->merge_context; + SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq; + const MergeOperator* merge_operator = s->merge_operator; + + assert(merge_context != nullptr); + + // entry format is: + // klength varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + Slice user_key_slice = Slice(key_ptr, key_length - 8); + if (s->mem->GetInternalKeyComparator() + .user_comparator() + ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + SequenceNumber seq; + UnPackSequenceAndType(tag, &seq, &type); + // If the value is not in the snapshot, skip it + if (!s->CheckCallback(seq)) { + return true; // to continue to the next seq + } + + s->seq = seq; + + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) && + max_covering_tombstone_seq > seq) { + type = kTypeRangeDeletion; + } + switch (type) { + case kTypeBlobIndex: + if (s->is_blob_index == nullptr) { + ROCKS_LOG_ERROR(s->logger, "Encounter unexpected blob index."); + *(s->status) = Status::NotSupported( + "Encounter unsupported blob value.
Please open DB with " + "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + } else if (*(s->merge_in_progress)) { + *(s->status) = + Status::NotSupported("Blob DB does not support merge operator."); + } + if (!s->status->ok()) { + *(s->found_final_value) = true; + return false; + } + FALLTHROUGH_INTENDED; + case kTypeValue: { + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadLock(); + } + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + *(s->status) = Status::OK(); + if (*(s->merge_in_progress)) { + if (s->do_merge) { + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } + } else { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); + } + } else if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); + } else if (s->value != nullptr) { + s->value->assign(v.data(), v.size()); + } + if (s->inplace_update_support) { + s->mem->GetLock(s->key->user_key())->ReadUnlock(); + } + *(s->found_final_value) = true; + if (s->is_blob_index != nullptr) { + *(s->is_blob_index) = (type == kTypeBlobIndex); + } + return false; + } + case kTypeDeletion: + case kTypeSingleDeletion: + case kTypeRangeDeletion: { + if (*(s->merge_in_progress)) { + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } + } else { + *(s->status) = Status::NotFound(); + } + *(s->found_final_value) = true; + return false; + } + case kTypeMerge: { + if (!merge_operator) { + *(s->status) = Status::InvalidArgument( + "merge_operator is not properly initialized."); + // Normally we continue the loop (return true) when we see a merge + // operand. But in case of an error, we should stop the loop + // immediately and pretend we have found the value to stop further + // seek. Otherwise, the later call will override this error status. + *(s->found_final_value) = true; + return false; + } + Slice v = GetLengthPrefixedSlice(key_ptr + key_length); + *(s->merge_in_progress) = true; + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); + if (s->do_merge && merge_operator->ShouldMerge( + merge_context->GetOperandsDirectionBackward())) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), s->value, s->logger, s->statistics, + s->env_, nullptr /* result_operand */, true); + *(s->found_final_value) = true; + return false; + } + return true; + } + default: + assert(false); + return true; + } + } + + // s->state could be Corrupt, merge or notfound + return false; +} + +bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + ReadCallback* callback, bool* is_blob_index, bool do_merge) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. 
+ return false; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( + NewRangeTombstoneIterator(read_opts, + GetInternalKeySeqno(key.internal_key()))); + if (range_del_iter != nullptr) { + *max_covering_tombstone_seq = + std::max(*max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key())); + } + + Slice user_key = key.user_key(); + bool found_final_value = false; + bool merge_in_progress = s->IsMergeInProgress(); + bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (bloom_filter_) { + // when both memtable_whole_key_filtering and prefix_extractor_ are set, + // only do whole key filtering for Get() to save CPU + if (moptions_.memtable_whole_key_filtering) { + may_contain = + bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); + } else { + assert(prefix_extractor_); + may_contain = + !prefix_extractor_->InDomain(user_key) || + bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); + } + } + + if (bloom_filter_ && !may_contain) { + // iter is null if prefix bloom says the key does not exist + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + *seq = kMaxSequenceNumber; + } else { + if (bloom_filter_) { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback, + is_blob_index, value, s, merge_context, seq, + &found_final_value, &merge_in_progress); + } + + // No change to value, since we have not yet found a Put/Delete + if (!found_final_value && merge_in_progress) { + *s = Status::MergeInProgress(); + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); + return found_final_value; +}
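The whole-key-versus-prefix decision above can be restated compactly. A sketch of the contract Get() relies on (illustrative; a std::unordered_set stands in for DynamicBloom, modelling "no false negatives" exactly while a real bloom filter may also return false positives):

```cpp
#include <string>
#include <unordered_set>

// With whole-key filtering the full user key is probed; otherwise the prefix
// is probed, and keys outside the extractor's domain must always be checked.
bool MayContain(const std::unordered_set<std::string>& filter,
                const std::string& user_key, bool whole_key_filtering,
                size_t prefix_len /* stand-in for a SliceTransform */) {
  if (whole_key_filtering) return filter.count(user_key) > 0;
  if (user_key.size() < prefix_len) return true;  // out of domain
  return filter.count(user_key.substr(0, prefix_len)) > 0;
}

int main() {
  // 3-byte prefixes were added on insert, so "apple" probes as "app".
  std::unordered_set<std::string> filter = {"app"};
  return MayContain(filter, "apple", /*whole_key_filtering=*/false, 3) ? 0 : 1;
}
```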
+ +void MemTable::GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, + bool do_merge, ReadCallback* callback, + bool* is_blob_index, std::string* value, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress) { + Saver saver; + saver.status = s; + saver.found_final_value = found_final_value; + saver.merge_in_progress = merge_in_progress; + saver.key = &key; + saver.value = value; + saver.seq = kMaxSequenceNumber; + saver.mem = this; + saver.merge_context = merge_context; + saver.max_covering_tombstone_seq = max_covering_tombstone_seq; + saver.merge_operator = moptions_.merge_operator; + saver.logger = moptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = moptions_.statistics; + saver.env_ = env_; + saver.callback_ = callback; + saver.is_blob_index = is_blob_index; + saver.do_merge = do_merge; + table_->Get(key, &saver, SaveValue); + *seq = saver.seq; +} + +void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. + return; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + MultiGetRange temp_range(*range, range->begin(), range->end()); + if (bloom_filter_) { + std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys; + std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}}; + autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes; + int num_keys = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (!prefix_extractor_) { + keys[num_keys++] = &iter->ukey; + } else if (prefix_extractor_->InDomain(iter->ukey)) { + prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey)); + keys[num_keys++] = &prefixes.back(); + } + } + bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); + int idx = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + continue; + } + if (!may_match[idx]) { + temp_range.SkipKey(iter); + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + idx++; + } + } + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + SequenceNumber seq = kMaxSequenceNumber; + bool found_final_value{false}; + bool merge_in_progress = iter->s->IsMergeInProgress(); + std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( + NewRangeTombstoneIterator( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); + if (range_del_iter != nullptr) { + iter->max_covering_tombstone_seq = std::max( + iter->max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); + } + GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, + callback, is_blob, iter->value->GetSelf(), iter->s, + &(iter->merge_context), &seq, &found_final_value, + &merge_in_progress); + + if (!found_final_value && merge_in_progress) { + *(iter->s) = Status::MergeInProgress(); + } + + if (found_final_value) { + iter->value->PinSelf(); + range->MarkKeyDone(iter); + RecordTick(moptions_.statistics, MEMTABLE_HIT); + } + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); +}
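The gather-probe-skip pattern in MultiGet's bloom pass generalizes beyond RocksDB. A standalone sketch (illustrative; kMaxBatch and the set-based filter are stand-ins for MultiGetContext::MAX_BATCH_SIZE and DynamicBloom's batched MayContain):

```cpp
#include <array>
#include <cstddef>
#include <string>
#include <unordered_set>

constexpr size_t kMaxBatch = 32;

int main() {
  std::unordered_set<std::string> filter = {"k1", "k3"};
  // Phase 1: gather pointers to the keys (or their prefixes).
  std::array<const std::string*, kMaxBatch> keys;
  std::array<bool, kMaxBatch> may_match = {{true}};
  std::string batch[] = {"k1", "k2", "k3"};
  size_t n = 0;
  for (const auto& k : batch) keys[n++] = &k;
  // Phase 2: one batched probe over all gathered keys.
  for (size_t i = 0; i < n; i++) may_match[i] = filter.count(*keys[i]) > 0;
  // Phase 3: skip the definite misses (MultiGet calls temp_range.SkipKey).
  size_t skipped = 0;
  for (size_t i = 0; i < n; i++) {
    if (!may_match[i]) ++skipped;
  }
  return skipped == 1 ? 0 : 1;  // "k2" is filtered out
}
```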
+ +void MemTable::Update(SequenceNumber seq, + const Slice& key, + const Slice& value) { + LookupKey lkey(key, seq); + Slice mem_key = lkey.memtable_key(); + + std::unique_ptr<MemTableRep::Iterator> iter( + table_->GetDynamicPrefixIterator()); + iter->Seek(lkey.internal_key(), mem_key.data()); + + if (iter->Valid()) { + // entry format is: + // key_length varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Equal( + Slice(key_ptr, key_length - 8), lkey.user_key())) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + SequenceNumber existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); + assert(existing_seq != seq); + if (type == kTypeValue) { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = static_cast<uint32_t>(prev_value.size()); + uint32_t new_size = static_cast<uint32_t>(value.size()); + + // Update value, if new value size <= previous value size + if (new_size <= prev_size) { + char* p = + EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size); + WriteLock wl(GetLock(lkey.user_key())); + memcpy(p, value.data(), value.size()); + assert((unsigned)((p + value.size()) - entry) == + (unsigned)(VarintLength(key_length) + key_length + + VarintLength(value.size()) + value.size())); + RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); + return; + } + } + } + } + + // key doesn't exist + bool add_res __attribute__((__unused__)); + add_res = Add(seq, kTypeValue, key, value); + // We already checked existing_seq != seq above. In that case, Add should not + // fail. + assert(add_res);
+}
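The size rule Update() applies is easy to isolate: an existing kTypeValue entry may be overwritten in place only when the new value is no longer than the old one, so the length prefix and the bytes after the entry stay valid. A minimal sketch of that rule (illustrative, with a hypothetical single-byte length prefix rather than a full varint):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

// value_area points just past the entry's length prefix; returns false when
// the update must instead go through a fresh Add(), as in MemTable::Update.
bool TryInplaceUpdate(char* value_area, uint32_t prev_size,
                      const std::string& new_value) {
  if (new_value.size() > prev_size) return false;
  // Sizes < 128 keep the varint32 prefix at one byte, so this sketch can
  // rewrite it directly; the real code re-encodes with EncodeVarint32.
  assert(prev_size < 128 && value_area[-1] == static_cast<char>(prev_size));
  value_area[-1] = static_cast<char>(new_value.size());
  std::memcpy(value_area, new_value.data(), new_value.size());
  return true;
}

int main() {
  char entry[1 + 16] = {16};  // [length byte][16-byte value area]
  std::memcpy(entry + 1, "0123456789abcdef", 16);
  bool ok = TryInplaceUpdate(entry + 1, 16, "short");
  return ok && entry[0] == 5 ? 0 : 1;
}
```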
+ +bool MemTable::UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta) { + LookupKey lkey(key, seq); + Slice memkey = lkey.memtable_key(); + + std::unique_ptr<MemTableRep::Iterator> iter( + table_->GetDynamicPrefixIterator()); + iter->Seek(lkey.internal_key(), memkey.data()); + + if (iter->Valid()) { + // entry format is: + // key_length varint32 + // userkey char[klength-8] + // tag uint64 + // vlength varint32 + // value char[vlength] + // Check that it belongs to same user key. We do not check the + // sequence number since the Seek() call above should have skipped + // all entries with overly large sequence numbers. + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (comparator_.comparator.user_comparator()->Equal( + Slice(key_ptr, key_length - 8), lkey.user_key())) { + // Correct user key + const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); + ValueType type; + uint64_t unused; + UnPackSequenceAndType(tag, &unused, &type); + switch (type) { + case kTypeValue: { + Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); + uint32_t prev_size = static_cast<uint32_t>(prev_value.size()); + + char* prev_buffer = const_cast<char*>(prev_value.data()); + uint32_t new_prev_size = prev_size; + + std::string str_value; + WriteLock wl(GetLock(lkey.user_key())); + auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // Value already updated by callback. + assert(new_prev_size <= prev_size); + if (new_prev_size < prev_size) { + // overwrite the new prev_size + char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length, + new_prev_size); + if (VarintLength(new_prev_size) < VarintLength(prev_size)) { + // shift the value buffer as well. + memcpy(p, prev_buffer, new_prev_size); + } + } + RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); + UpdateFlushState(); + return true; + } else if (status == UpdateStatus::UPDATED) { + Add(seq, kTypeValue, key, Slice(str_value)); + RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); + UpdateFlushState(); + return true; + } else if (status == UpdateStatus::UPDATE_FAILED) { + // No action required. Return. + UpdateFlushState(); + return true; + } + } + default: + break; + } + } + } + // If the latest value is not kTypeValue + // or key doesn't exist + return false; +} + +size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { + Slice memkey = key.memtable_key(); + + // A totally ordered iterator is costly for some memtablerep (prefix aware + // reps). By passing in the user key, we allow efficient iterator creation. + // The iterator only needs to be ordered within the same user key. + std::unique_ptr<MemTableRep::Iterator> iter( + table_->GetDynamicPrefixIterator()); + iter->Seek(key.internal_key(), memkey.data()); + + size_t num_successive_merges = 0; + + for (; iter->Valid(); iter->Next()) { + const char* entry = iter->key(); + uint32_t key_length = 0; + const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if (!comparator_.comparator.user_comparator()->Equal( + Slice(iter_key_ptr, key_length - 8), key.user_key())) { + break; + } + + const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8); + ValueType type; + uint64_t unused; + UnPackSequenceAndType(tag, &unused, &type); + if (type != kTypeMerge) { + break; + } + + ++num_successive_merges; + } + + return num_successive_merges; +} + +void MemTableRep::Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) { + auto iter = GetDynamicPrefixIterator(); + for (iter->Seek(k.internal_key(), k.memtable_key().data()); + iter->Valid() && callback_func(callback_args, iter->key()); + iter->Next()) { + } +} + +void MemTable::RefLogContainingPrepSection(uint64_t log) { + assert(log > 0); + auto cur = min_prep_log_referenced_.load(); + while ((log < cur || cur == 0) && + !min_prep_log_referenced_.compare_exchange_strong(cur, log)) { + cur = min_prep_log_referenced_.load(); + } +} + +uint64_t MemTable::GetMinLogContainingPrepSection() { + return min_prep_log_referenced_.load(); +} + +} // namespace ROCKSDB_NAMESPACE
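RefLogContainingPrepSection is an instance of a common lock-free idiom: lower an atomic toward a new minimum (with 0 meaning "unset") without ever raising it. A standalone sketch of the same loop (illustrative; note that compare_exchange_strong already reloads the expected value on failure, so the explicit reload in the original is redundant but harmless):

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

void LowerToMin(std::atomic<uint64_t>& target, uint64_t log) {
  auto cur = target.load();
  // Retry while `log` would still lower the minimum and the CAS loses a race;
  // on CAS failure `cur` is refreshed and the condition is re-checked.
  while ((log < cur || cur == 0) &&
         !target.compare_exchange_strong(cur, log)) {
  }
}

int main() {
  std::atomic<uint64_t> min_log{0};
  LowerToMin(min_log, 7);
  LowerToMin(min_log, 9);  // ignored: 9 > 7
  LowerToMin(min_log, 3);
  assert(min_log.load() == 3);
}
```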
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h new file mode 100644 index 000000000..f4e4b98a9 --- /dev/null +++ b/src/rocksdb/db/memtable.h @@ -0,0 +1,542 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <atomic> +#include <deque> +#include <functional> +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> +#include "db/dbformat.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/read_callback.h" +#include "db/version_edit.h" +#include "memory/allocator.h" +#include "memory/concurrent_arena.h" +#include "monitoring/instrumented_mutex.h" +#include "options/cf_options.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "table/multiget_context.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +struct FlushJobInfo; +class Mutex; +class MemTableIterator; +class MergeContext; + +struct ImmutableMemTableOptions { + explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + size_t memtable_huge_page_size; + bool memtable_whole_key_filtering; + bool inplace_update_support; + size_t inplace_update_num_locks; + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + size_t max_successive_merges; + Statistics* statistics; + MergeOperator* merge_operator; + Logger* info_log; +}; + +// Batched counters to be updated when inserting keys in one write batch. +// In post process of the write batch, these can be updated together. +// Only used in concurrent memtable insert case. +struct MemTablePostProcessInfo { + uint64_t data_size = 0; + uint64_t num_entries = 0; + uint64_t num_deletes = 0; +}; + +using MultiGetRange = MultiGetContext::Range; +// Note: Many of the methods in this class have comments indicating that +// external synchronization is required as these methods are not thread-safe. +// It is up to higher layers of code to decide how to prevent concurrent +// invocation of these methods. This is usually done by acquiring either +// the db mutex or the single writer thread. +// +// Some of these methods are documented to only require external +// synchronization if this memtable is immutable. Calling MarkImmutable() is +// not sufficient to guarantee immutability. It is up to higher layers of +// code to determine if this MemTable can still be modified by other threads. +// Eg: The Superversion stores a pointer to the current MemTable (that can +// be modified) and a separate list of the MemTables that can no longer be +// written to (aka the 'immutable memtables'). +class MemTable { + public: + struct KeyComparator : public MemTableRep::KeyComparator { + const InternalKeyComparator comparator; + explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } + virtual int operator()(const char* prefix_len_key1, + const char* prefix_len_key2) const override; + virtual int operator()(const char* prefix_len_key, + const DecodedType& key) const override; + }; + + // MemTables are reference counted. The initial reference count + // is zero and the caller must call Ref() at least once. + // + // earliest_seq should be the current SequenceNumber in the db such that any + // key inserted into this memtable will have an equal or larger seq number. + // (When a db is first created, the earliest sequence number will be 0). + // If the earliest sequence number is not known, kMaxSequenceNumber may be + // used, but this may prevent some transactions from succeeding until the + // first key is inserted into the memtable.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const ImmutableCFOptions& ioptions,
+                    const MutableCFOptions& mutable_cf_options,
+                    WriteBufferManager* write_buffer_manager,
+                    SequenceNumber earliest_seq, uint32_t column_family_id);
+  // No copying allowed
+  MemTable(const MemTable&) = delete;
+  MemTable& operator=(const MemTable&) = delete;
+
+  // Do not delete this MemTable unless Unref() indicates it is not in use.
+  ~MemTable();
+
+  // Increase reference count.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  size_t ApproximateMemoryUsage();
+
+  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+  // require external synchronization. The value may be less accurate, though.
+  size_t ApproximateMemoryUsageFast() const {
+    return approximate_memory_usage_.load(std::memory_order_relaxed);
+  }
+
+  // This method heuristically determines if the memtable should continue to
+  // host more data.
+  bool ShouldScheduleFlush() const {
+    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+  }
+
+  // Returns true if a flush should be scheduled and the caller should
+  // be the one to schedule it.
+  bool MarkFlushScheduled() {
+    auto before = FLUSH_REQUESTED;
+    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+                                                std::memory_order_relaxed,
+                                                std::memory_order_relaxed);
+  }
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live. The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // By default, it returns an iterator for prefix seek if prefix_extractor
+  // is configured in Options.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        Calling ~Iterator of the iterator will destroy all the states but
+  //        those allocated in arena.
+  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& read_options, SequenceNumber read_seq);
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  //
+  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+  // simultaneous operations on the same MemTable.
+  //
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  bool Add(SequenceNumber seq, ValueType type, const Slice& key,
+           const Slice& value, bool allow_concurrent = false,
+           MemTablePostProcessInfo* post_process_info = nullptr,
+           void** hint = nullptr);
+
+  // Used to Get value associated with key or Get Merge Operands associated
+  // with key.
+ // If do_merge = true the default behavior which is Get value for key is + // executed. Expected behavior is described right below. + // If memtable contains a value for key, store it in *value and return true. + // If memtable contains a deletion for key, store a NotFound() error + // in *status and return true. + // If memtable contains Merge operation as the most recent entry for a key, + // and the merge process does not stop (not reaching a value or delete), + // prepend the current merge operand to *operands. + // store MergeInProgress in s, and return false. + // Else, return false. + // If any operation was found, its most recent sequence number + // will be stored in *seq on success (regardless of whether true/false is + // returned). Otherwise, *seq will be set to kMaxSequenceNumber. + // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other + // status returned indicates a corruption or other unexpected error. + // If do_merge = false then any Merge Operands encountered for key are simply + // stored in merge_context.operands_list and never actually merged to get a + // final value. The raw Merge Operands are eventually returned to the user. + bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr, bool do_merge = true); + + bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr, bool do_merge = true) { + SequenceNumber seq; + return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, + read_opts, callback, is_blob_index, do_merge); + } + + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob); + + // Attempts to update the new_value inplace, else does normal Add + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // if new sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else add(key, new_value) + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + void Update(SequenceNumber seq, + const Slice& key, + const Slice& value); + + // If prev_value for key exists, attempts to update it inplace. + // else returns false + // Pseudocode + // if key exists in current memtable && prev_value is of type kTypeValue + // new_value = delta(prev_value) + // if sizeof(new_value) <= sizeof(prev_value) + // update inplace + // else add(key, new_value) + // else return false + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + bool UpdateCallback(SequenceNumber seq, + const Slice& key, + const Slice& delta); + + // Returns the number of successive merge entries starting from the newest + // entry for the key up to the last non-merge entry or last entry for the + // key in the memtable. + size_t CountSuccessiveMergeEntries(const LookupKey& key); + + // Update counters and flush status after inserting a whole write batch + // Used in concurrent memtable inserts. 
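+  //
+  // A hedged sketch of the intended call pattern (the batch loop and the
+  // names other than Add() and BatchPostProcess() are assumptions):
+  //   MemTablePostProcessInfo info;
+  //   for (const auto& kv : batch) {
+  //     mem->Add(seq++, kTypeValue, kv.first, kv.second,
+  //              /*allow_concurrent=*/true, &info);
+  //   }
+  //   mem->BatchPostProcess(info);  // fold the batched counters in once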
+ void BatchPostProcess(const MemTablePostProcessInfo& update_counters) { + num_entries_.fetch_add(update_counters.num_entries, + std::memory_order_relaxed); + data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed); + if (update_counters.num_deletes != 0) { + num_deletes_.fetch_add(update_counters.num_deletes, + std::memory_order_relaxed); + } + UpdateFlushState(); + } + + // Get total number of entries in the mem table. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + uint64_t num_entries() const { + return num_entries_.load(std::memory_order_relaxed); + } + + // Get total number of deletes in the mem table. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + uint64_t num_deletes() const { + return num_deletes_.load(std::memory_order_relaxed); + } + + uint64_t get_data_size() const { + return data_size_.load(std::memory_order_relaxed); + } + + // Dynamically change the memtable's capacity. If set below the current usage, + // the next key added will trigger a flush. Can only increase size when + // memtable prefix bloom is disabled, since we can't easily allocate more + // space. + void UpdateWriteBufferSize(size_t new_write_buffer_size) { + if (bloom_filter_ == nullptr || + new_write_buffer_size < write_buffer_size_) { + write_buffer_size_.store(new_write_buffer_size, + std::memory_order_relaxed); + } + } + + // Returns the edits area that is needed for flushing the memtable + VersionEdit* GetEdits() { return &edit_; } + + // Returns if there is no entry inserted to the mem table. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + bool IsEmpty() const { return first_seqno_ == 0; } + + // Returns the sequence number of the first element that was inserted + // into the memtable. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + SequenceNumber GetFirstSequenceNumber() { + return first_seqno_.load(std::memory_order_relaxed); + } + + // Returns the sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into this + // memtable. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + // + // If the earliest sequence number could not be determined, + // kMaxSequenceNumber will be returned. + SequenceNumber GetEarliestSequenceNumber() { + return earliest_seqno_.load(std::memory_order_relaxed); + } + + // DB's latest sequence ID when the memtable is created. This number + // may be updated to a more recent one before any key is inserted. + SequenceNumber GetCreationSeq() const { return creation_seq_; } + + void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; } + + // Returns the next active logfile number when this memtable is about to + // be flushed to storage + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + uint64_t GetNextLogNumber() { return mem_next_logfile_number_; } + + // Sets the next active logfile number when this memtable is about to + // be flushed to storage + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. 
+ void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } + + // if this memtable contains data from a committed + // two phase transaction we must take note of the + // log which contains that data so we can know + // when to relese that log + void RefLogContainingPrepSection(uint64_t log); + uint64_t GetMinLogContainingPrepSection(); + + // Notify the underlying storage that no more items will be added. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + // After MarkImmutable() is called, you should not attempt to + // write anything to this MemTable(). (Ie. do not call Add() or Update()). + void MarkImmutable() { + table_->MarkReadOnly(); + mem_tracker_.DoneAllocating(); + } + + // Notify the underlying storage that all data it contained has been + // persisted. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + void MarkFlushed() { + table_->MarkFlushed(); + } + + // return true if the current MemTableRep supports merge operator. + bool IsMergeOperatorSupported() const { + return table_->IsMergeOperatorSupported(); + } + + // return true if the current MemTableRep supports snapshots. + // inplace update prevents snapshots, + bool IsSnapshotSupported() const { + return table_->IsSnapshotSupported() && !moptions_.inplace_update_support; + } + + struct MemTableStats { + uint64_t size; + uint64_t count; + }; + + MemTableStats ApproximateStats(const Slice& start_ikey, + const Slice& end_ikey); + + // Get the lock associated for the key + port::RWMutex* GetLock(const Slice& key); + + const InternalKeyComparator& GetInternalKeyComparator() const { + return comparator_.comparator; + } + + const ImmutableMemTableOptions* GetImmutableMemTableOptions() const { + return &moptions_; + } + + uint64_t ApproximateOldestKeyTime() const { + return oldest_key_time_.load(std::memory_order_relaxed); + } + + // REQUIRES: db_mutex held. 
+ void SetID(uint64_t id) { id_ = id; } + + uint64_t GetID() const { return id_; } + + void SetFlushCompleted(bool completed) { flush_completed_ = completed; } + + uint64_t GetFileNumber() const { return file_number_; } + + void SetFileNumber(uint64_t file_num) { file_number_ = file_num; } + + void SetFlushInProgress(bool in_progress) { + flush_in_progress_ = in_progress; + } + +#ifndef ROCKSDB_LITE + void SetFlushJobInfo(std::unique_ptr&& info) { + flush_job_info_ = std::move(info); + } + + std::unique_ptr ReleaseFlushJobInfo() { + return std::move(flush_job_info_); + } +#endif // !ROCKSDB_LITE + + private: + enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; + + friend class MemTableIterator; + friend class MemTableBackwardIterator; + friend class MemTableList; + + KeyComparator comparator_; + const ImmutableMemTableOptions moptions_; + int refs_; + const size_t kArenaBlockSize; + AllocTracker mem_tracker_; + ConcurrentArena arena_; + std::unique_ptr table_; + std::unique_ptr range_del_table_; + std::atomic_bool is_range_del_table_empty_; + + // Total data size of all data inserted + std::atomic data_size_; + std::atomic num_entries_; + std::atomic num_deletes_; + + // Dynamically changeable memtable option + std::atomic write_buffer_size_; + + // These are used to manage memtable flushes to storage + bool flush_in_progress_; // started the flush + bool flush_completed_; // finished the flush + uint64_t file_number_; // filled up after flush is complete + + // The updates to be applied to the transaction log when this + // memtable is flushed to storage. + VersionEdit edit_; + + // The sequence number of the kv that was inserted first + std::atomic first_seqno_; + + // The db sequence number at the time of creation or kMaxSequenceNumber + // if not set. + std::atomic earliest_seqno_; + + SequenceNumber creation_seq_; + + // The log files earlier than this number can be deleted. + uint64_t mem_next_logfile_number_; + + // the earliest log containing a prepared section + // which has been inserted into this memtable. + std::atomic min_prep_log_referenced_; + + // rw locks for inplace updates + std::vector locks_; + + const SliceTransform* const prefix_extractor_; + std::unique_ptr bloom_filter_; + + std::atomic flush_state_; + + Env* env_; + + // Extract sequential insert prefixes. + const SliceTransform* insert_with_hint_prefix_extractor_; + + // Insert hints for each prefix. + std::unordered_map insert_hints_; + + // Timestamp of oldest key + std::atomic oldest_key_time_; + + // Memtable id to track flush. + uint64_t id_ = 0; + + // Sequence number of the atomic flush that is responsible for this memtable. + // The sequence number of atomic flush is a seq, such that no writes with + // sequence numbers greater than or equal to seq are flushed, while all + // writes with sequence number smaller than seq are flushed. + SequenceNumber atomic_flush_seqno_; + + // keep track of memory usage in table_, arena_, and range_del_table_. + // Gets refrshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` + std::atomic approximate_memory_usage_; + +#ifndef ROCKSDB_LITE + // Flush job info of the current memtable. 
+ std::unique_ptr flush_job_info_; +#endif // !ROCKSDB_LITE + + // Returns a heuristic flush decision + bool ShouldFlushNow(); + + // Updates flush_state_ using ShouldFlushNow() + void UpdateFlushState(); + + void UpdateOldestKeyTime(); + + void GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, bool do_merge, + ReadCallback* callback, bool* is_blob_index, + std::string* value, Status* s, MergeContext* merge_context, + SequenceNumber* seq, bool* found_final_value, + bool* merge_in_progress); +}; + +extern const char* EncodeKey(std::string* scratch, const Slice& target); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc new file mode 100644 index 000000000..a8b358fa6 --- /dev/null +++ b/src/rocksdb/db/memtable_list.cc @@ -0,0 +1,771 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/memtable_list.h" + +#include +#include +#include +#include +#include "db/db_impl/db_impl.h" +#include "db/memtable.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_set.h" +#include "logging/log_buffer.h" +#include "monitoring/thread_status_util.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "table/merging_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class InternalKeyComparator; +class Mutex; +class VersionSet; + +void MemTableListVersion::AddMemTable(MemTable* m) { + memlist_.push_front(m); + *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage(); +} + +void MemTableListVersion::UnrefMemTable(autovector* to_delete, + MemTable* m) { + if (m->Unref()) { + to_delete->push_back(m); + assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage()); + *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage(); + } +} + +MemTableListVersion::MemTableListVersion( + size_t* parent_memtable_list_memory_usage, MemTableListVersion* old) + : max_write_buffer_number_to_maintain_( + old->max_write_buffer_number_to_maintain_), + max_write_buffer_size_to_maintain_( + old->max_write_buffer_size_to_maintain_), + parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) { + if (old != nullptr) { + memlist_ = old->memlist_; + for (auto& m : memlist_) { + m->Ref(); + } + + memlist_history_ = old->memlist_history_; + for (auto& m : memlist_history_) { + m->Ref(); + } + } +} + +MemTableListVersion::MemTableListVersion( + size_t* parent_memtable_list_memory_usage, + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain) + : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain), + parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {} + +void MemTableListVersion::Ref() { ++refs_; } + +// called by superversion::clean() +void MemTableListVersion::Unref(autovector* to_delete) { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + // if to_delete is equal to nullptr it means we're confident + // that refs_ will not be zero + assert(to_delete != nullptr); + for (const auto& m : memlist_) { + UnrefMemTable(to_delete, m); + } + for (const auto& m : memlist_history_) { + UnrefMemTable(to_delete, m); + } + delete 
this; + } +} + +int MemTableList::NumNotFlushed() const { + int size = static_cast(current_->memlist_.size()); + assert(num_flush_not_started_ <= size); + return size; +} + +int MemTableList::NumFlushed() const { + return static_cast(current_->memlist_history_.size()); +} + +// Search all the memtables starting from the most recent one. +// Return the most recent value found, if any. +// Operands stores the list of merge operations to apply, so far. +bool MemTableListVersion::Get(const LookupKey& key, std::string* value, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + ReadCallback* callback, bool* is_blob_index) { + return GetFromList(&memlist_, key, value, s, merge_context, + max_covering_tombstone_seq, seq, read_opts, callback, + is_blob_index); +} + +void MemTableListVersion::MultiGet(const ReadOptions& read_options, + MultiGetRange* range, ReadCallback* callback, + bool* is_blob) { + for (auto memtable : memlist_) { + memtable->MultiGet(read_options, range, callback, is_blob); + if (range->empty()) { + return; + } + } +} + +bool MemTableListVersion::GetMergeOperands( + const LookupKey& key, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { + for (MemTable* memtable : memlist_) { + bool done = memtable->Get(key, nullptr, s, merge_context, + max_covering_tombstone_seq, read_opts, nullptr, + nullptr, false); + if (done) { + return true; + } + } + return false; +} + +bool MemTableListVersion::GetFromHistory( + const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { + return GetFromList(&memlist_history_, key, value, s, merge_context, + max_covering_tombstone_seq, seq, read_opts, + nullptr /*read_callback*/, is_blob_index); +} + +bool MemTableListVersion::GetFromList( + std::list* list, const LookupKey& key, std::string* value, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) { + *seq = kMaxSequenceNumber; + + for (auto& memtable : *list) { + SequenceNumber current_seq = kMaxSequenceNumber; + + bool done = + memtable->Get(key, value, s, merge_context, max_covering_tombstone_seq, + ¤t_seq, read_opts, callback, is_blob_index); + if (*seq == kMaxSequenceNumber) { + // Store the most recent sequence number of any operation on this key. + // Since we only care about the most recent change, we only need to + // return the first operation found when searching memtables in + // reverse-chronological order. + // current_seq would be equal to kMaxSequenceNumber if the value was to be + // skipped. This allows seq to be assigned again when the next value is + // read. + *seq = current_seq; + } + + if (done) { + assert(*seq != kMaxSequenceNumber || s->IsNotFound()); + return true; + } + if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { + return false; + } + } + return false; +} + +Status MemTableListVersion::AddRangeTombstoneIterators( + const ReadOptions& read_opts, Arena* /*arena*/, + RangeDelAggregator* range_del_agg) { + assert(range_del_agg != nullptr); + // Except for snapshot read, using kMaxSequenceNumber is OK because these + // are immutable memtables. + SequenceNumber read_seq = read_opts.snapshot != nullptr + ? 
read_opts.snapshot->GetSequenceNumber() + : kMaxSequenceNumber; + for (auto& m : memlist_) { + std::unique_ptr range_del_iter( + m->NewRangeTombstoneIterator(read_opts, read_seq)); + range_del_agg->AddTombstones(std::move(range_del_iter)); + } + return Status::OK(); +} + +void MemTableListVersion::AddIterators( + const ReadOptions& options, std::vector* iterator_list, + Arena* arena) { + for (auto& m : memlist_) { + iterator_list->push_back(m->NewIterator(options, arena)); + } +} + +void MemTableListVersion::AddIterators( + const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) { + for (auto& m : memlist_) { + merge_iter_builder->AddIterator( + m->NewIterator(options, merge_iter_builder->GetArena())); + } +} + +uint64_t MemTableListVersion::GetTotalNumEntries() const { + uint64_t total_num = 0; + for (auto& m : memlist_) { + total_num += m->num_entries(); + } + return total_num; +} + +MemTable::MemTableStats MemTableListVersion::ApproximateStats( + const Slice& start_ikey, const Slice& end_ikey) { + MemTable::MemTableStats total_stats = {0, 0}; + for (auto& m : memlist_) { + auto mStats = m->ApproximateStats(start_ikey, end_ikey); + total_stats.size += mStats.size; + total_stats.count += mStats.count; + } + return total_stats; +} + +uint64_t MemTableListVersion::GetTotalNumDeletes() const { + uint64_t total_num = 0; + for (auto& m : memlist_) { + total_num += m->num_deletes(); + } + return total_num; +} + +SequenceNumber MemTableListVersion::GetEarliestSequenceNumber( + bool include_history) const { + if (include_history && !memlist_history_.empty()) { + return memlist_history_.back()->GetEarliestSequenceNumber(); + } else if (!memlist_.empty()) { + return memlist_.back()->GetEarliestSequenceNumber(); + } else { + return kMaxSequenceNumber; + } +} + +// caller is responsible for referencing m +void MemTableListVersion::Add(MemTable* m, autovector* to_delete) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + AddMemTable(m); + + TrimHistory(to_delete, m->ApproximateMemoryUsage()); +} + +// Removes m from list of memtables not flushed. Caller should NOT Unref m. +void MemTableListVersion::Remove(MemTable* m, + autovector* to_delete) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + memlist_.remove(m); + + m->MarkFlushed(); + if (max_write_buffer_size_to_maintain_ > 0 || + max_write_buffer_number_to_maintain_ > 0) { + memlist_history_.push_front(m); + // Unable to get size of mutable memtable at this point, pass 0 to + // TrimHistory as a best effort. 
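+    // (With usage == 0 only the already-flushed history is counted against
+    // max_write_buffer_size_to_maintain_, so this trim can only be less
+    // aggressive than one that knows the mutable memtable's size; a later
+    // TrimHistory() call with a real usage value can trim further.)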
+ TrimHistory(to_delete, 0); + } else { + UnrefMemTable(to_delete, m); + } +} + +// return the total memory usage assuming the oldest flushed memtable is dropped +size_t MemTableListVersion::ApproximateMemoryUsageExcludingLast() const { + size_t total_memtable_size = 0; + for (auto& memtable : memlist_) { + total_memtable_size += memtable->ApproximateMemoryUsage(); + } + for (auto& memtable : memlist_history_) { + total_memtable_size += memtable->ApproximateMemoryUsage(); + } + if (!memlist_history_.empty()) { + total_memtable_size -= memlist_history_.back()->ApproximateMemoryUsage(); + } + return total_memtable_size; +} + +bool MemTableListVersion::MemtableLimitExceeded(size_t usage) { + if (max_write_buffer_size_to_maintain_ > 0) { + // calculate the total memory usage after dropping the oldest flushed + // memtable, compare with max_write_buffer_size_to_maintain_ to decide + // whether to trim history + return ApproximateMemoryUsageExcludingLast() + usage >= + static_cast(max_write_buffer_size_to_maintain_); + } else if (max_write_buffer_number_to_maintain_ > 0) { + return memlist_.size() + memlist_history_.size() > + static_cast(max_write_buffer_number_to_maintain_); + } else { + return false; + } +} + +// Make sure we don't use up too much space in history +void MemTableListVersion::TrimHistory(autovector* to_delete, + size_t usage) { + while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) { + MemTable* x = memlist_history_.back(); + memlist_history_.pop_back(); + + UnrefMemTable(to_delete, x); + } +} + +// Returns true if there is at least one memtable on which flush has +// not yet started. +bool MemTableList::IsFlushPending() const { + if ((flush_requested_ && num_flush_not_started_ > 0) || + (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) { + assert(imm_flush_needed.load(std::memory_order_relaxed)); + return true; + } + return false; +} + +// Returns the memtables that need to be flushed. +void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, + autovector* ret) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); + const auto& memlist = current_->memlist_; + bool atomic_flush = false; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { + atomic_flush = true; + } + if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { + break; + } + if (!m->flush_in_progress_) { + assert(!m->flush_completed_); + num_flush_not_started_--; + if (num_flush_not_started_ == 0) { + imm_flush_needed.store(false, std::memory_order_release); + } + m->flush_in_progress_ = true; // flushing will start very soon + ret->push_back(m); + } + } + if (!atomic_flush || num_flush_not_started_ == 0) { + flush_requested_ = false; // start-flush request is complete + } +} + +void MemTableList::RollbackMemtableFlush(const autovector& mems, + uint64_t /*file_number*/) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_MEMTABLE_ROLLBACK); + assert(!mems.empty()); + + // If the flush was not successful, then just reset state. + // Maybe a succeeding attempt to flush will be successful. 
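+  // Concretely: clear each memtable's flush state and its pending edits,
+  // and re-arm imm_flush_needed so the flush scheduler will pick these
+  // memtables up again.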
+  for (MemTable* m : mems) {
+    assert(m->flush_in_progress_);
+    assert(m->file_number_ == 0);
+
+    m->flush_in_progress_ = false;
+    m->flush_completed_ = false;
+    m->edit_.Clear();
+    num_flush_not_started_++;
+  }
+  imm_flush_needed.store(true, std::memory_order_release);
+}
+
+// Try to record a successful flush in the manifest file. It might just
+// return Status::OK letting a concurrent flush do the actual recording.
+Status MemTableList::TryInstallMemtableFlushResults(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+    VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+    autovector<MemTable*>* to_delete, Directory* db_directory,
+    LogBuffer* log_buffer,
+    std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+  mu->AssertHeld();
+
+  // Flush was successful.
+  // Record the status on the memtable object. Either this call or a call by a
+  // concurrent flush thread will read the status and write it to manifest.
+  for (size_t i = 0; i < mems.size(); ++i) {
+    // All the edits are associated with the first memtable of this batch.
+    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+    mems[i]->flush_completed_ = true;
+    mems[i]->file_number_ = file_number;
+  }
+
+  // If some other thread is already committing, then return.
+  Status s;
+  if (commit_in_progress_) {
+    TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
+    return s;
+  }
+
+  // Only a single thread can be executing this piece of code.
+  commit_in_progress_ = true;
+
+  // Retry until all completed flushes are committed. New flushes can finish
+  // while the current thread is writing manifest where mutex is released.
+  while (s.ok()) {
+    auto& memlist = current_->memlist_;
+    // The back is the oldest; if flush_completed_ is not set to it, it means
+    // that we were assigned a more recent memtable. The memtables' flushes
+    // must be recorded in manifest in order. A concurrent flush thread, which
+    // is assigned to flush the oldest memtable, will later wake up and do all
+    // the pending writes to manifest, in order.
+    if (memlist.empty() || !memlist.back()->flush_completed_) {
+      break;
+    }
+    // Scan all memtables from the earliest, and commit those
+    // (in that order) that have finished flushing. Memtables
+    // are always committed in the order that they were created.
+    uint64_t batch_file_number = 0;
+    size_t batch_count = 0;
+    autovector<VersionEdit*> edit_list;
+    autovector<MemTable*> memtables_to_flush;
+    // Enumerate from the last (earliest) element to see how many batches
+    // have finished.
+    for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+      MemTable* m = *it;
+      if (!m->flush_completed_) {
+        break;
+      }
+      if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
+        batch_file_number = m->file_number_;
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] Level-0 commit table #%" PRIu64 " started",
+                         cfd->GetName().c_str(), m->file_number_);
+        edit_list.push_back(&m->edit_);
+        memtables_to_flush.push_back(m);
+#ifndef ROCKSDB_LITE
+        std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
+        if (info != nullptr) {
+          committed_flush_jobs_info->push_back(std::move(info));
+        }
+#else
+        (void)committed_flush_jobs_info;
+#endif  // !ROCKSDB_LITE
+      }
+      batch_count++;
+    }
+
+    // TODO(myabandeh): Not sure how batch_count could be 0 here.
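+    // Summary of the commit step below: a single LogAndApply() call covers
+    // the whole batch; on success the flushed memtables are removed from the
+    // list (possibly moving to history), and on failure their flush state is
+    // reset so a later flush can retry them.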
+ if (batch_count > 0) { + if (vset->db_options()->allow_2pc) { + assert(edit_list.size() > 0); + // We piggyback the information of earliest log file to keep in the + // manifest entry for the last file flushed. + edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep( + vset, *cfd, edit_list, memtables_to_flush, prep_tracker)); + } + + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, + db_directory); + + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. + + // commit new state only if the column family is NOT dropped. + // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); + ++mem_id; + } + } else { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. + ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + m->file_number_, mem_id); + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } + } + } + } + commit_in_progress_ = false; + return s; +} + +// New memtables are inserted at the front of the list. +void MemTableList::Add(MemTable* m, autovector* to_delete) { + assert(static_cast(current_->memlist_.size()) >= num_flush_not_started_); + InstallNewVersion(); + // this method is used to move mutable memtable into an immutable list. + // since mutable memtable is already refcounted by the DBImpl, + // and when moving to the imutable list we don't unref it, + // we don't have to ref the memtable here. we just take over the + // reference from the DBImpl. 
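+  // (That reference is dropped later through UnrefMemTable() when the
+  // memtable leaves memlist_ or memlist_history_.)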
+ current_->Add(m, to_delete); + m->MarkImmutable(); + num_flush_not_started_++; + if (num_flush_not_started_ == 1) { + imm_flush_needed.store(true, std::memory_order_release); + } + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); +} + +void MemTableList::TrimHistory(autovector* to_delete, size_t usage) { + InstallNewVersion(); + current_->TrimHistory(to_delete, usage); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); +} + +// Returns an estimate of the number of bytes of data in use. +size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() { + size_t total_size = 0; + for (auto& memtable : current_->memlist_) { + total_size += memtable->ApproximateMemoryUsage(); + } + return total_size; +} + +size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; } + +size_t MemTableList::ApproximateMemoryUsageExcludingLast() const { + const size_t usage = + current_memory_usage_excluding_last_.load(std::memory_order_relaxed); + return usage; +} + +bool MemTableList::HasHistory() const { + const bool has_history = current_has_history_.load(std::memory_order_relaxed); + return has_history; +} + +void MemTableList::UpdateCachedValuesFromMemTableListVersion() { + const size_t total_memtable_size = + current_->ApproximateMemoryUsageExcludingLast(); + current_memory_usage_excluding_last_.store(total_memtable_size, + std::memory_order_relaxed); + + const bool has_history = current_->HasHistory(); + current_has_history_.store(has_history, std::memory_order_relaxed); +} + +uint64_t MemTableList::ApproximateOldestKeyTime() const { + if (!current_->memlist_.empty()) { + return current_->memlist_.back()->ApproximateOldestKeyTime(); + } + return std::numeric_limits::max(); +} + +void MemTableList::InstallNewVersion() { + if (current_->refs_ == 1) { + // we're the only one using the version, just keep using it + } else { + // somebody else holds the current version, we need to create new one + MemTableListVersion* version = current_; + current_ = new MemTableListVersion(¤t_memory_usage_, current_); + current_->Ref(); + version->Unref(); + } +} + +uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( + const autovector& memtables_to_flush) { + uint64_t min_log = 0; + + for (auto& m : current_->memlist_) { + // Assume the list is very short, we can live with O(m*n). We can optimize + // if the performance has some problem. + bool should_skip = false; + for (MemTable* m_to_flush : memtables_to_flush) { + if (m == m_to_flush) { + should_skip = true; + break; + } + } + if (should_skip) { + continue; + } + + auto log = m->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +// Commit a successful atomic flush in the manifest file. +Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, VersionSet* vset, + InstrumentedMutex* mu, const autovector& file_metas, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); + mu->AssertHeld(); + + size_t num = mems_list.size(); + assert(cfds.size() == num); + if (imm_lists != nullptr) { + assert(imm_lists->size() == num); + } + for (size_t k = 0; k != num; ++k) { +#ifndef NDEBUG + const auto* imm = + (imm_lists == nullptr) ? 
cfds[k]->imm() : imm_lists->at(k); + if (!mems_list[k]->empty()) { + assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID()); + } +#endif + assert(nullptr != file_metas[k]); + for (size_t i = 0; i != mems_list[k]->size(); ++i) { + assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0); + (*mems_list[k])[i]->SetFlushCompleted(true); + (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber()); + } + } + + Status s; + + autovector> edit_lists; + uint32_t num_entries = 0; + for (const auto mems : mems_list) { + assert(mems != nullptr); + autovector edits; + assert(!mems->empty()); + edits.emplace_back((*mems)[0]->GetEdits()); + ++num_entries; + edit_lists.emplace_back(edits); + } + // Mark the version edits as an atomic group if the number of version edits + // exceeds 1. + if (cfds.size() > 1) { + for (auto& edits : edit_lists) { + assert(edits.size() == 1); + edits[0]->MarkAtomicGroup(--num_entries); + } + assert(0 == num_entries); + } + + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, + db_directory); + + for (size_t k = 0; k != cfds.size(); ++k) { + auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); + imm->InstallNewVersion(); + } + + if (s.ok() || s.IsColumnFamilyDropped()) { + for (size_t i = 0; i != cfds.size(); ++i) { + if (cfds[i]->IsDropped()) { + continue; + } + auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + assert(m->GetFileNumber() > 0); + uint64_t mem_id = m->GetID(); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + imm->current_->Remove(m, to_delete); + imm->UpdateCachedValuesFromMemTableListVersion(); + imm->ResetTrimHistoryNeeded(); + } + } + } else { + for (size_t i = 0; i != cfds.size(); ++i) { + auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + uint64_t mem_id = m->GetID(); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + m->SetFlushCompleted(false); + m->SetFlushInProgress(false); + m->GetEdits()->Clear(); + m->SetFileNumber(0); + imm->num_flush_not_started_++; + } + imm->imm_flush_needed.store(true, std::memory_order_release); + } + } + + return s; +} + +void MemTableList::RemoveOldMemTables(uint64_t log_number, + autovector* to_delete) { + assert(to_delete != nullptr); + InstallNewVersion(); + auto& memlist = current_->memlist_; + autovector old_memtables; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* mem = *it; + if (mem->GetNextLogNumber() > log_number) { + break; + } + old_memtables.push_back(mem); + } + + for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) { + MemTable* mem = *it; + current_->Remove(mem, to_delete); + --num_flush_not_started_; + if (0 == num_flush_not_started_) { + imm_flush_needed.store(false, std::memory_order_release); + } + } + + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h new file mode 100644 index 000000000..a6acf6a32 --- /dev/null +++ b/src/rocksdb/db/memtable_list.h @@ -0,0 +1,422 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/logs_with_prep_tracker.h" +#include "db/memtable.h" +#include "db/range_del_aggregator.h" +#include "file/filename.h" +#include "logging/log_buffer.h" +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyData; +class InternalKeyComparator; +class InstrumentedMutex; +class MergeIteratorBuilder; +class MemTableList; + +struct FlushJobInfo; + +// keeps a list of immutable memtables in a vector. the list is immutable +// if refcount is bigger than one. It is used as a state for Get() and +// Iterator code paths +// +// This class is not thread-safe. External synchronization is required +// (such as holding the db mutex or being on the write thread). +class MemTableListVersion { + public: + explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, + MemTableListVersion* old = nullptr); + explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain); + + void Ref(); + void Unref(autovector* to_delete = nullptr); + + // Search all the memtables starting from the most recent one. + // Return the most recent value found, if any. + // + // If any operation was found for this key, its most recent sequence number + // will be stored in *seq on success (regardless of whether true/false is + // returned). Otherwise, *seq will be set to kMaxSequenceNumber. + bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); + + bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr) { + SequenceNumber seq; + return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, + read_opts, callback, is_blob_index); + } + + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob); + + // Returns all the merge operands corresponding to the key by searching all + // memtables starting from the most recent one. + bool GetMergeOperands(const LookupKey& key, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts); + + // Similar to Get(), but searches the Memtable history of memtables that + // have already been flushed. Should only be used from in-memory only + // queries (such as Transaction validation) as the history may contain + // writes that are also present in the SST files. 
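+  //
+  // A hedged usage sketch (the surrounding transaction-validation state is
+  // an assumption):
+  //   SequenceNumber seq;
+  //   if (list_version->GetFromHistory(lkey, &value, &s, &merge_context,
+  //                                    &max_covering_tombstone_seq, &seq,
+  //                                    read_opts)) {
+  //     // compare seq against the transaction's snapshot to detect conflicts
+  //   }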
+ bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + bool* is_blob_index = nullptr); + bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, + bool* is_blob_index = nullptr) { + SequenceNumber seq; + return GetFromHistory(key, value, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, + is_blob_index); + } + + Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena, + RangeDelAggregator* range_del_agg); + + void AddIterators(const ReadOptions& options, + std::vector* iterator_list, + Arena* arena); + + void AddIterators(const ReadOptions& options, + MergeIteratorBuilder* merge_iter_builder); + + uint64_t GetTotalNumEntries() const; + + uint64_t GetTotalNumDeletes() const; + + MemTable::MemTableStats ApproximateStats(const Slice& start_ikey, + const Slice& end_ikey); + + // Returns the value of MemTable::GetEarliestSequenceNumber() on the most + // recent MemTable in this list or kMaxSequenceNumber if the list is empty. + // If include_history=true, will also search Memtables in MemTableList + // History. + SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const; + + private: + friend class MemTableList; + + friend Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + VersionSet* vset, InstrumentedMutex* mu, + const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); + + // REQUIRE: m is an immutable memtable + void Add(MemTable* m, autovector* to_delete); + // REQUIRE: m is an immutable memtable + void Remove(MemTable* m, autovector* to_delete); + + void TrimHistory(autovector* to_delete, size_t usage); + + bool GetFromList(std::list* list, const LookupKey& key, + std::string* value, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); + + void AddMemTable(MemTable* m); + + void UnrefMemTable(autovector* to_delete, MemTable* m); + + // Calculate the total amount of memory used by memlist_ and memlist_history_ + // excluding the last MemTable in memlist_history_. The reason for excluding + // the last MemTable is to see if dropping the last MemTable will keep total + // memory usage above or equal to max_write_buffer_size_to_maintain_ + size_t ApproximateMemoryUsageExcludingLast() const; + + // Whether this version contains flushed memtables that are only kept around + // for transaction conflict checking. + bool HasHistory() const { return !memlist_history_.empty(); } + + bool MemtableLimitExceeded(size_t usage); + + // Immutable MemTables that have not yet been flushed. + std::list memlist_; + + // MemTables that have already been flushed + // (used during Transaction validation) + std::list memlist_history_; + + // Maximum number of MemTables to keep in memory (including both flushed + const int max_write_buffer_number_to_maintain_; + // Maximum size of MemTables to keep in memory (including both flushed + // and not-yet-flushed tables). 
+ const int64_t max_write_buffer_size_to_maintain_; + + int refs_ = 0; + + size_t* parent_memtable_list_memory_usage_; +}; + +// This class stores references to all the immutable memtables. +// The memtables are flushed to L0 as soon as possible and in +// any order. If there are more than one immutable memtable, their +// flushes can occur concurrently. However, they are 'committed' +// to the manifest in FIFO order to maintain correctness and +// recoverability from a crash. +// +// +// Other than imm_flush_needed and imm_trim_needed, this class is not +// thread-safe and requires external synchronization (such as holding the db +// mutex or being on the write thread.) +class MemTableList { + public: + // A list of memtables. + explicit MemTableList(int min_write_buffer_number_to_merge, + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain) + : imm_flush_needed(false), + imm_trim_needed(false), + min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), + current_(new MemTableListVersion(¤t_memory_usage_, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain)), + num_flush_not_started_(0), + commit_in_progress_(false), + flush_requested_(false), + current_memory_usage_(0), + current_memory_usage_excluding_last_(0), + current_has_history_(false) { + current_->Ref(); + } + + // Should not delete MemTableList without making sure MemTableList::current() + // is Unref()'d. + ~MemTableList() {} + + MemTableListVersion* current() const { return current_; } + + // so that background threads can detect non-nullptr pointer to + // determine whether there is anything more to start flushing. + std::atomic imm_flush_needed; + + std::atomic imm_trim_needed; + + // Returns the total number of memtables in the list that haven't yet + // been flushed and logged. + int NumNotFlushed() const; + + // Returns total number of memtables in the list that have been + // completely flushed and logged. + int NumFlushed() const; + + // Returns true if there is at least one memtable on which flush has + // not yet started. + bool IsFlushPending() const; + + // Returns the earliest memtables that needs to be flushed. The returned + // memtables are guaranteed to be in the ascending order of created time. + void PickMemtablesToFlush(const uint64_t* max_memtable_id, + autovector* mems); + + // Reset status of the given memtable list back to pending state so that + // they can get picked up again on the next round of flush. + void RollbackMemtableFlush(const autovector& mems, + uint64_t file_number); + + // Try commit a successful flush in the manifest file. It might just return + // Status::OK letting a concurrent flush to do the actual the recording. + Status TryInstallMemtableFlushResults( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + const autovector& m, LogsWithPrepTracker* prep_tracker, + VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer, + std::list>* committed_flush_jobs_info); + + // New memtables are inserted at the front of the list. + // Takes ownership of the referenced held on *m by the caller of Add(). + void Add(MemTable* m, autovector* to_delete); + + // Returns an estimate of the number of bytes of data in use. + size_t ApproximateMemoryUsage(); + + // Returns the cached current_memory_usage_excluding_last_ value. + size_t ApproximateMemoryUsageExcludingLast() const; + + // Returns the cached current_has_history_ value. 
+  bool HasHistory() const;
+
+  // Updates current_memory_usage_excluding_last_ and current_has_history_
+  // from MemTableListVersion. Must be called whenever InstallNewVersion is
+  // called.
+  void UpdateCachedValuesFromMemTableListVersion();
+
+  // `usage` is the current size of the mutable Memtable. When
+  // max_write_buffer_size_to_maintain is used, total size of mutable and
+  // immutable memtables is checked against it to decide whether to trim
+  // memtable list.
+  void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+  // Returns an estimate of the number of bytes of data used by
+  // the unflushed mem-tables.
+  size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+  // Returns an estimate of the timestamp of the earliest key.
+  uint64_t ApproximateOldestKeyTime() const;
+
+  // Request a flush of all existing memtables to storage. This will
+  // cause future calls to IsFlushPending() to return true if this list is
+  // non-empty (regardless of the min_write_buffer_number_to_merge
+  // parameter). This flush request will persist until the next time
+  // PickMemtablesToFlush() is called.
+  void FlushRequested() { flush_requested_ = true; }
+
+  bool HasFlushRequested() { return flush_requested_; }
+
+  // Returns true if a trim history should be scheduled and the caller should
+  // be the one to schedule it.
+  bool MarkTrimHistoryNeeded() {
+    auto expected = false;
+    return imm_trim_needed.compare_exchange_strong(
+        expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  void ResetTrimHistoryNeeded() {
+    auto expected = true;
+    imm_trim_needed.compare_exchange_strong(
+        expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  // Copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+  size_t* current_memory_usage() { return &current_memory_usage_; }
+
+  // Returns the min log containing the prep section after memtables listed in
+  // `memtables_to_flush` are flushed and their status is persisted in
+  // manifest.
+  uint64_t PrecomputeMinLogContainingPrepSection(
+      const autovector<MemTable*>& memtables_to_flush);
+
+  uint64_t GetEarliestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    return memlist.back()->GetID();
+  }
+
+  uint64_t GetLatestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return 0;
+    }
+    return memlist.front()->GetID();
+  }
+
+  void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+    const auto& memlist = current_->memlist_;
+    // Scan the memtable list from new to old
+    for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+      MemTable* mem = *it;
+      if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+        mem->atomic_flush_seqno_ = seq;
+      } else {
+        // Earlier memtables must have been assigned an atomic flush seq; no
+        // need to continue the scan.
+        break;
+      }
+    }
+  }
+
+  // Used only by DBImplSecondary during log replay.
+  // Remove memtables whose data were written before the WAL with log_number
+  // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables
+  // are not freed, but put into a vector for future deref and reclamation.
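+  // A hedged example (the log number is illustrative):
+  //   autovector<MemTable*> to_delete;
+  //   imm->RemoveOldMemTables(/*log_number=*/12, &to_delete);
+  //   // the caller reclaims the memtables collected in to_delete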
+ void RemoveOldMemTables(uint64_t log_number, + autovector* to_delete); + + private: + friend Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + VersionSet* vset, InstrumentedMutex* mu, + const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); + + // DB mutex held + void InstallNewVersion(); + + const int min_write_buffer_number_to_merge_; + + MemTableListVersion* current_; + + // the number of elements that still need flushing + int num_flush_not_started_; + + // committing in progress + bool commit_in_progress_; + + // Requested a flush of memtables to storage. It's possible to request that + // a subset of memtables be flushed. + bool flush_requested_; + + // The current memory usage. + size_t current_memory_usage_; + + // Cached value of current_->ApproximateMemoryUsageExcludingLast(). + std::atomic current_memory_usage_excluding_last_; + + // Cached value of current_->HasHistory(). + std::atomic current_has_history_; +}; + +// Installs memtable atomic flush results. +// In most cases, imm_lists is nullptr, and the function simply uses the +// immutable memtable lists associated with the cfds. There are unit tests that +// installs flush results for external immutable memtable lists other than the +// cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case, +// imm_lists parameter is not nullptr. +extern Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, VersionSet* vset, + InstrumentedMutex* mu, const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc new file mode 100644 index 000000000..a92bc6c79 --- /dev/null +++ b/src/rocksdb/db/memtable_list_test.cc @@ -0,0 +1,922 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
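+//
+// The tests below exercise MemTableList directly. The temporary DB and the
+// mock VersionSet created in the helpers exist only to satisfy the
+// dependencies of TryInstallMemtableFlushResults(); see
+// Mock_InstallMemtableFlushResults() below.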
+ +#include "db/memtable_list.h" +#include +#include +#include +#include "db/merge_context.h" +#include "db/version_set.h" +#include "db/write_controller.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/write_buffer_manager.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class MemTableListTest : public testing::Test { + public: + std::string dbname; + DB* db; + Options options; + std::vector handles; + std::atomic file_number; + + MemTableListTest() : db(nullptr), file_number(1) { + dbname = test::PerThreadDBPath("memtable_list_test"); + options.create_if_missing = true; + DestroyDB(dbname, options); + } + + // Create a test db if not yet created + void CreateDB() { + if (db == nullptr) { + options.create_if_missing = true; + DestroyDB(dbname, options); + // Open DB only with default column family + ColumnFamilyOptions cf_options; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options); + Status s = DB::Open(options, dbname, cf_descs, &handles, &db); + EXPECT_OK(s); + + ColumnFamilyOptions cf_opt1, cf_opt2; + cf_opt1.cf_paths.emplace_back(dbname + "_one_1", + std::numeric_limits::max()); + cf_opt2.cf_paths.emplace_back(dbname + "_two_1", + std::numeric_limits::max()); + int sz = static_cast(handles.size()); + handles.resize(sz + 2); + s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]); + EXPECT_OK(s); + s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]); + EXPECT_OK(s); + + cf_descs.emplace_back("one", cf_options); + cf_descs.emplace_back("two", cf_options); + } + } + + ~MemTableListTest() override { + if (db) { + std::vector cf_descs(handles.size()); + for (int i = 0; i != static_cast(handles.size()); ++i) { + handles[i]->GetDescriptor(&cf_descs[i]); + } + for (auto h : handles) { + if (h) { + db->DestroyColumnFamilyHandle(h); + } + } + handles.clear(); + delete db; + db = nullptr; + DestroyDB(dbname, options, cf_descs); + } + } + + // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all + // structures needed to call this function. + Status Mock_InstallMemtableFlushResults( + MemTableList* list, const MutableCFOptions& mutable_cf_options, + const autovector& m, autovector* to_delete) { + // Create a mock Logger + test::NullLogger logger; + LogBuffer log_buffer(DEBUG_LEVEL, &logger); + + CreateDB(); + // Create a mock VersionSet + DBOptions db_options; + db_options.file_system = FileSystem::Default(); + ImmutableDBOptions immutable_db_options(db_options); + EnvOptions env_options; + std::shared_ptr table_cache(NewLRUCache(50000, 16)); + WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); + WriteController write_controller(10000000u); + + VersionSet versions(dbname, &immutable_db_options, env_options, + table_cache.get(), &write_buffer_manager, + &write_controller, /*block_cache_tracer=*/nullptr); + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back("one", ColumnFamilyOptions()); + cf_descs.emplace_back("two", ColumnFamilyOptions()); + + EXPECT_OK(versions.Recover(cf_descs, false)); + + // Create mock default ColumnFamilyData + auto column_family_set = versions.GetColumnFamilySet(); + LogsWithPrepTracker dummy_prep_tracker; + auto cfd = column_family_set->GetDefault(); + EXPECT_TRUE(nullptr != cfd); + uint64_t file_num = file_number.fetch_add(1); + // Create dummy mutex. 
+    InstrumentedMutex mutex;
+    InstrumentedMutexLock l(&mutex);
+    std::list<std::unique_ptr<FlushJobInfo>> flush_jobs_info;
+    Status s = list->TryInstallMemtableFlushResults(
+        cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
+        file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info);
+    return s;
+  }
+
+  // Calls MemTableList::InstallMemtableAtomicFlushResults() and sets up all
+  // structures needed to call this function.
+  Status Mock_InstallMemtableAtomicFlushResults(
+      autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      autovector<MemTable*>* to_delete) {
+    // Create a mock Logger
+    test::NullLogger logger;
+    LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+    CreateDB();
+    // Create a mock VersionSet
+    DBOptions db_options;
+    db_options.file_system.reset(new LegacyFileSystemWrapper(db_options.env));
+
+    ImmutableDBOptions immutable_db_options(db_options);
+    EnvOptions env_options;
+    std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+    WriteController write_controller(10000000u);
+
+    VersionSet versions(dbname, &immutable_db_options, env_options,
+                        table_cache.get(), &write_buffer_manager,
+                        &write_controller, /*block_cache_tracer=*/nullptr);
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+    cf_descs.emplace_back("one", ColumnFamilyOptions());
+    cf_descs.emplace_back("two", ColumnFamilyOptions());
+    EXPECT_OK(versions.Recover(cf_descs, false));
+
+    // Create mock default ColumnFamilyData
+
+    auto column_family_set = versions.GetColumnFamilySet();
+
+    LogsWithPrepTracker dummy_prep_tracker;
+    autovector<ColumnFamilyData*> cfds;
+    for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
+      cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
+      EXPECT_NE(nullptr, cfds[i]);
+    }
+    std::vector<FileMetaData> file_metas;
+    file_metas.reserve(cf_ids.size());
+    for (size_t i = 0; i != cf_ids.size(); ++i) {
+      FileMetaData meta;
+      uint64_t file_num = file_number.fetch_add(1);
+      meta.fd = FileDescriptor(file_num, 0, 0);
+      file_metas.emplace_back(meta);
+    }
+    autovector<FileMetaData*> file_meta_ptrs;
+    for (auto& meta : file_metas) {
+      file_meta_ptrs.push_back(&meta);
+    }
+    InstrumentedMutex mutex;
+    InstrumentedMutexLock l(&mutex);
+    return InstallMemtableAtomicFlushResults(
+        &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex,
+        file_meta_ptrs, to_delete, nullptr, &log_buffer);
+  }
+};
+
+TEST_F(MemTableListTest, Empty) {
+  // Create an empty MemTableList and validate basic functions.
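+  // MemTableList constructor arguments, in order:
+  // (min_write_buffer_number_to_merge, max_write_buffer_number_to_maintain,
+  //  max_write_buffer_size_to_maintain).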
+ MemTableList list(1, 0, 0); + + ASSERT_EQ(0, list.NumNotFlushed()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + + autovector mems; + list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems); + ASSERT_EQ(0, mems.size()); + + autovector to_delete; + list.current()->Unref(&to_delete); + ASSERT_EQ(0, to_delete.size()); +} + +TEST_F(MemTableListTest, GetTest) { + // Create MemTableList + int min_write_buffer_number_to_merge = 2; + int max_write_buffer_number_to_maintain = 0; + int64_t max_write_buffer_size_to_maintain = 0; + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + SequenceNumber seq = 1; + std::string value; + Status s; + MergeContext merge_context; + InternalKeyComparator ikey_cmp(options.comparator); + SequenceNumber max_covering_tombstone_seq = 0; + autovector to_delete; + + LookupKey lkey("key1", seq); + bool found = list.current()->Get(lkey, &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + ImmutableCFOptions ioptions(options); + + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + mem->Ref(); + + // Write some keys to this memtable. + mem->Add(++seq, kTypeDeletion, "key1", ""); + mem->Add(++seq, kTypeValue, "key2", "value2"); + mem->Add(++seq, kTypeValue, "key1", "value1"); + mem->Add(++seq, kTypeValue, "key2", "value2.2"); + + // Fetch the newly written keys + merge_context.Clear(); + found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value1"); + + merge_context.Clear(); + found = mem->Get(LookupKey("key1", 2), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + // MemTable found out that this key is *not* found (at this sequence#) + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value2.2"); + + ASSERT_EQ(4, mem->num_entries()); + ASSERT_EQ(1, mem->num_deletes()); + + // Add memtable to list + list.Add(mem, &to_delete); + + SequenceNumber saved_seq = seq; + + // Create another memtable and write some keys to it + WriteBufferManager wb2(options.db_write_buffer_size); + MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2, + kMaxSequenceNumber, 0 /* column_family_id */); + mem2->Ref(); + + mem2->Add(++seq, kTypeDeletion, "key1", ""); + mem2->Add(++seq, kTypeValue, "key2", "value2.3"); + + // Add second memtable to list + list.Add(mem2, &to_delete); + + // Fetch keys via MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = list.current()->Get(LookupKey("key1", saved_seq), &value, &s, + &merge_context, &max_covering_tombstone_seq, + ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ("value1", value); + + merge_context.Clear(); + found = + 
list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value2.3"); + + merge_context.Clear(); + found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + ASSERT_EQ(2, list.NumNotFlushed()); + + list.current()->Unref(&to_delete); + for (MemTable* m : to_delete) { + delete m; + } +} + +TEST_F(MemTableListTest, GetFromHistoryTest) { + // Create MemTableList + int min_write_buffer_number_to_merge = 2; + int max_write_buffer_number_to_maintain = 2; + int64_t max_write_buffer_size_to_maintain = 2000; + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + SequenceNumber seq = 1; + std::string value; + Status s; + MergeContext merge_context; + InternalKeyComparator ikey_cmp(options.comparator); + SequenceNumber max_covering_tombstone_seq = 0; + autovector to_delete; + + LookupKey lkey("key1", seq); + bool found = list.current()->Get(lkey, &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + ImmutableCFOptions ioptions(options); + + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + mem->Ref(); + + // Write some keys to this memtable. + mem->Add(++seq, kTypeDeletion, "key1", ""); + mem->Add(++seq, kTypeValue, "key2", "value2"); + mem->Add(++seq, kTypeValue, "key2", "value2.2"); + + // Fetch the newly written keys + merge_context.Clear(); + found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + // MemTable found out that this key is *not* found (at this sequence#) + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ(value, "value2.2"); + + // Add memtable to list + list.Add(mem, &to_delete); + ASSERT_EQ(0, to_delete.size()); + + // Fetch keys via MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(s.ok() && found); + ASSERT_EQ("value2.2", value); + + // Flush this memtable from the list. + // (It will then be a part of the memtable history). 
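+  // PickMemtablesToFlush marks the memtable as flush-in-progress, and
+  // Mock_InstallMemtableFlushResults commits the result, moving the table
+  // from the not-flushed list into the history list.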
+ autovector to_flush; + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + ASSERT_EQ(1, to_flush.size()); + + MutableCFOptions mutable_cf_options(options); + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); + ASSERT_OK(s); + ASSERT_EQ(0, list.NumNotFlushed()); + ASSERT_EQ(1, list.NumFlushed()); + ASSERT_EQ(0, to_delete.size()); + + // Verify keys are no longer in MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Verify keys are present in history + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(found); + ASSERT_EQ("value2.2", value); + + // Create another memtable and write some keys to it + WriteBufferManager wb2(options.db_write_buffer_size); + MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2, + kMaxSequenceNumber, 0 /* column_family_id */); + mem2->Ref(); + + mem2->Add(++seq, kTypeDeletion, "key1", ""); + mem2->Add(++seq, kTypeValue, "key3", "value3"); + + // Add second memtable to list + list.Add(mem2, &to_delete); + ASSERT_EQ(0, to_delete.size()); + + to_flush.clear(); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + ASSERT_EQ(1, to_flush.size()); + + // Flush second memtable + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); + ASSERT_OK(s); + ASSERT_EQ(0, list.NumNotFlushed()); + ASSERT_EQ(2, list.NumFlushed()); + ASSERT_EQ(0, to_delete.size()); + + // Add a third memtable to push the first memtable out of the history + WriteBufferManager wb3(options.db_write_buffer_size); + MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3, + kMaxSequenceNumber, 0 /* column_family_id */); + mem3->Ref(); + list.Add(mem3, &to_delete); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_EQ(1, list.NumFlushed()); + ASSERT_EQ(1, to_delete.size()); + + // Verify keys are no longer in MemTableList + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key3", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Verify that the second memtable's keys are in the history + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_TRUE(found && s.IsNotFound()); + + merge_context.Clear(); + found = list.current()->GetFromHistory( + LookupKey("key3", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + 
ASSERT_TRUE(found); + ASSERT_EQ("value3", value); + + // Verify that key2 from the first memtable is no longer in the history + merge_context.Clear(); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); + ASSERT_FALSE(found); + + // Cleanup + list.current()->Unref(&to_delete); + ASSERT_EQ(3, to_delete.size()); + for (MemTable* m : to_delete) { + delete m; + } +} + +TEST_F(MemTableListTest, FlushPendingTest) { + const int num_tables = 6; + SequenceNumber seq = 1; + Status s; + + auto factory = std::make_shared(); + options.memtable_factory = factory; + ImmutableCFOptions ioptions(options); + InternalKeyComparator cmp(BytewiseComparator()); + WriteBufferManager wb(options.db_write_buffer_size); + autovector to_delete; + + // Create MemTableList + int min_write_buffer_number_to_merge = 3; + int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + // Create some MemTables + uint64_t memtable_id = 0; + std::vector tables; + MutableCFOptions mutable_cf_options(options); + for (int i = 0; i < num_tables; i++) { + MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + mem->SetID(memtable_id++); + mem->Ref(); + + std::string value; + MergeContext merge_context; + + mem->Add(++seq, kTypeValue, "key1", ToString(i)); + mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); + mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); + mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); + mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + + tables.push_back(mem); + } + + // Nothing to flush + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + autovector to_flush; + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + ASSERT_EQ(0, to_flush.size()); + + // Request a flush even though there is nothing to flush + list.FlushRequested(); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Attempt to 'flush' to clear request for flush + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + ASSERT_EQ(0, to_flush.size()); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Request a flush again + list.FlushRequested(); + // No flush pending since the list is empty. + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + + // Add 2 tables + list.Add(tables[0], &to_delete); + list.Add(tables[1], &to_delete); + ASSERT_EQ(2, list.NumNotFlushed()); + ASSERT_EQ(0, to_delete.size()); + + // Even though we have less than the minimum to flush, a flush is + // pending since we had previously requested a flush and never called + // PickMemtablesToFlush() to clear the flush. 
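+  // (The request persists until the next PickMemtablesToFlush() call, per
+  // the FlushRequested() contract in memtable_list.h.)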
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+  ASSERT_EQ(2, to_flush.size());
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Revert flush
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another table
+  list.Add(tables[2], &to_delete);
+  // We now have the minimum to flush regardless of whether FlushRequested()
+  // was called.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+  ASSERT_EQ(3, to_flush.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush2;
+  list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2);
+  ASSERT_EQ(0, to_flush2.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Add another table
+  list.Add(tables[3], &to_delete);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Request a flush again
+  list.FlushRequested();
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2);
+  ASSERT_EQ(1, to_flush2.size());
+  ASSERT_EQ(4, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Rollback first pick of tables
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another table
+  list.Add(tables[4], &to_delete);
+  ASSERT_EQ(5, list.NumNotFlushed());
+  // We now have the minimum to flush regardless of whether FlushRequested()
+  // was called.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush);
+  // Should pick 4 of 5 since 1 table has been picked in to_flush2
+  ASSERT_EQ(4, to_flush.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush3;
+  list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3);
+  ASSERT_EQ(0, to_flush3.size());  // nothing not in progress of being flushed
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Flush the 4 memtables that were picked in to_flush
+  s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+                                       &to_delete);
+  ASSERT_OK(s);
+
+  // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains
+  // tables[3].
+  // Current implementation will only commit memtables in the order they were
+  // created. So TryInstallMemtableFlushResults will install the first 3 tables
+  // in to_flush and stop when it encounters a table not yet flushed.
+  ASSERT_EQ(2, list.NumNotFlushed());
+  int num_in_history =
+      std::min(3, static_cast<int>(max_write_buffer_size_to_maintain) /
+                      static_cast<int>(options.write_buffer_size));
+  ASSERT_EQ(num_in_history, list.NumFlushed());
+  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+  // Request a flush again. Should be nothing to flush
+  list.FlushRequested();
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Flush the 1 memtable that was picked in to_flush2
+  s = MemTableListTest::Mock_InstallMemtableFlushResults(
+      &list, mutable_cf_options, to_flush2, &to_delete);
+  ASSERT_OK(s);
+
+  // This will actually install 2 tables. The 1 we told it to flush, and also
+  // tables[4] which has been waiting for tables[3] to commit.
+  ASSERT_EQ(0, list.NumNotFlushed());
+  num_in_history =
+      std::min(5, static_cast<int>(max_write_buffer_size_to_maintain) /
+                      static_cast<int>(options.write_buffer_size));
+  ASSERT_EQ(num_in_history, list.NumFlushed());
+  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+    // Verify this, by Ref'ing then UnRef'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
+
+  // Add another table
+  list.Add(tables[5], &to_delete);
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_EQ(5, list.GetLatestMemTableID());
+  memtable_id = 4;
+  // Pick tables to flush. The tables to pick must have ID smaller than or
+  // equal to 4. Therefore, no table will be selected in this case.
+  autovector<MemTable*> to_flush4;
+  list.FlushRequested();
+  ASSERT_TRUE(list.HasFlushRequested());
+  list.PickMemtablesToFlush(&memtable_id, &to_flush4);
+  ASSERT_TRUE(to_flush4.empty());
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.HasFlushRequested());
+
+  // Pick tables to flush. The tables to pick must have ID smaller than or
+  // equal to 5. Therefore, only tables[5] will be selected.
+  memtable_id = 5;
+  list.FlushRequested();
+  list.PickMemtablesToFlush(&memtable_id, &to_flush4);
+  ASSERT_EQ(1, static_cast<int>(to_flush4.size()));
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_FALSE(list.IsFlushPending());
+  to_delete.clear();
+
+  list.current()->Unref(&to_delete);
+  int to_delete_size =
+      std::min(num_tables,
+               static_cast<int>(max_write_buffer_size_to_maintain) /
+                   static_cast<int>(options.write_buffer_size));
+  ASSERT_EQ(to_delete_size, to_delete.size());
+
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+    // Verify this, by Ref'ing then UnRef'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
+}
+
+TEST_F(MemTableListTest, EmptyAtomicFlusTest) {
+  autovector<MemTableList*> lists;
+  autovector<uint32_t> cf_ids;
+  autovector<const MutableCFOptions*> options_list;
+  autovector<const autovector<MemTable*>*> to_flush;
+  autovector<MemTable*> to_delete;
+  Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids,
+                                                    options_list, to_flush,
+                                                    &to_delete);
+  ASSERT_OK(s);
+  ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(MemTableListTest, AtomicFlusTest) {
+  const int num_cfs = 3;
+  const int num_tables_per_cf = 2;
+  SequenceNumber seq = 1;
+
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableCFOptions ioptions(options);
+  InternalKeyComparator cmp(BytewiseComparator());
+  WriteBufferManager wb(options.db_write_buffer_size);
+
+  // Create MemTableLists
+  int min_write_buffer_number_to_merge = 3;
+  int max_write_buffer_number_to_maintain = 7;
+  int64_t max_write_buffer_size_to_maintain =
+      7 * static_cast<int64_t>(options.write_buffer_size);
+  autovector<MemTableList*> lists;
+  for (int i = 0; i != num_cfs; ++i) {
+    lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
+                                        max_write_buffer_number_to_maintain,
+                                        max_write_buffer_size_to_maintain));
+  }
+
+  autovector<uint32_t> cf_ids;
+  std::vector<std::vector<MemTable*>> tables(num_cfs);
+  autovector<const MutableCFOptions*> mutable_cf_options_list;
+  uint32_t cf_id = 0;
+  for (auto& elem : tables) {
+    mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
+    uint64_t memtable_id = 0;
+    for (int i = 0; i != num_tables_per_cf; ++i) {
+      MemTable* mem =
+          new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
+                       kMaxSequenceNumber, cf_id);
+      mem->SetID(memtable_id++);
+      mem->Ref();
+
+      std::string value;
+
+      mem->Add(++seq, kTypeValue, "key1", ToString(i));
+      mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN");
+      mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value");
+      mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM");
+      mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "");
+
+      elem.push_back(mem);
+    }
+    cf_ids.push_back(cf_id++);
+  }
+
+  std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
+
+  // Nothing to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    auto* list = lists[i];
+    ASSERT_FALSE(list->IsFlushPending());
+    ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+    list->PickMemtablesToFlush(nullptr /* memtable_id */,
+                               &flush_candidates[i]);
+    ASSERT_EQ(0, flush_candidates[i].size());
+  }
+  // Request flush even though there is nothing to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    auto* list = lists[i];
+    list->FlushRequested();
+    ASSERT_FALSE(list->IsFlushPending());
+    ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+  }
+  autovector<MemTable*> to_delete;
+  // Add tables to the immutable memtable lists associated with column families
+  for (auto i = 0; i != num_cfs; ++i) {
+    for (auto j = 0; j != num_tables_per_cf; ++j) {
+      lists[i]->Add(tables[i][j], &to_delete);
+    }
+    ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
+    ASSERT_TRUE(lists[i]->IsFlushPending());
+    ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
+  }
+  std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
+  //          +----+
+  // list[0]: |0  1|
+  // list[1]: |0  1|
+  //          |  +--+
+  // list[2]: |0|  1
+  //          +-+
+  // Pick memtables to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    flush_candidates[i].clear();
+    lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i],
+                                   &flush_candidates[i]);
+    ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
+              static_cast<uint64_t>(flush_candidates[i].size()));
+  }
+  autovector<MemTableList*> tmp_lists;
+  autovector<uint32_t> tmp_cf_ids;
+  autovector<const MutableCFOptions*> tmp_options_list;
+  autovector<const autovector<MemTable*>*> to_flush;
+  for (auto i = 0; i != num_cfs; ++i) {
+    if (!flush_candidates[i].empty()) {
+      to_flush.push_back(&flush_candidates[i]);
+      tmp_lists.push_back(lists[i]);
+      tmp_cf_ids.push_back(i);
+      tmp_options_list.push_back(mutable_cf_options_list[i]);
+    }
+  }
+  Status s = Mock_InstallMemtableAtomicFlushResults(
+      tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
+  ASSERT_OK(s);
+
+  for (auto i = 0; i != num_cfs; ++i) {
+    for (auto j = 0; j != num_tables_per_cf; ++j) {
+      if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
+        ASSERT_LT(0, tables[i][j]->GetFileNumber());
+      }
+    }
+    ASSERT_EQ(
+        static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
+        lists[i]->NumNotFlushed());
+  }
+
+  to_delete.clear();
+  for (auto list : lists) {
+    list->current()->Unref(&to_delete);
+    delete list;
+  }
+  for (auto& mutable_cf_options : mutable_cf_options_list) {
+    if (mutable_cf_options != nullptr) {
+      delete mutable_cf_options;
+      mutable_cf_options = nullptr;
+    }
+  }
+  // All memtables in tables array must have been flushed, thus ready to be
+  // deleted.
+  ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling InstallMemtableFlushResults.
+    // Verify this by Ref'ing and then Unref'ing.
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
new file mode 100644
index 000000000..e1869a341
--- /dev/null
+++ b/src/rocksdb/db/merge_context.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing partial or full merge.
+class MergeContext {
+ public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list_) {
+      operand_list_->clear();
+      copied_operands_->clear();
+    }
+  }
+
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+    Initialize();
+    SetDirectionBackward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // Push back a merge operand
+  void PushOperandBack(const Slice& operand_slice,
+                       bool operand_pinned = false) {
+    Initialize();
+    SetDirectionForward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // return total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list_) {
+      return 0;
+    }
+    return operand_list_->size();
+  }
+
+  // Get the operand at the index.
+  Slice GetOperand(int index) {
+    assert(operand_list_);
+
+    SetDirectionForward();
+    return (*operand_list_)[index];
+  }
+
+  // Same as GetOperandsDirectionForward
+  const std::vector<Slice>& GetOperands() {
+    return GetOperandsDirectionForward();
+  }
+
+  // Return all the operands in the order as they were merged (passed to
+  // FullMerge or FullMergeV2)
+  const std::vector<Slice>& GetOperandsDirectionForward() {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionForward();
+    return *operand_list_;
+  }
+
+  // Return all the operands in the reversed order relative to how they were
+  // merged (passed to FullMerge or FullMergeV2)
+  const std::vector<Slice>& GetOperandsDirectionBackward() {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionBackward();
+    return *operand_list_;
+  }
+
+ private:
+  void Initialize() {
+    if (!operand_list_) {
+      operand_list_.reset(new std::vector<Slice>());
+      copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+    }
+  }
+
+  void SetDirectionForward() {
+    if (operands_reversed_ == true) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = false;
+    }
+  }
+
+  void SetDirectionBackward() {
+    if (operands_reversed_ == false) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = true;
+    }
+  }
+
+  // List of operands
+  std::unique_ptr<std::vector<Slice>> operand_list_;
+  // Copy of operands that are not pinned.
+  std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+  bool operands_reversed_ = true;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
new file mode 100644
index 000000000..96fe79251
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.cc
@@ -0,0 +1,417 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
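Editorial aside before the merge machinery itself: a brief illustration of the MergeContext contract defined above (a sketch, not patch content). Operands are typically pushed newest-to-oldest while walking the LSM tree, and the direction-aware getters reverse the list lazily, only when the requested order changes:

    MergeContext ctx;
    ctx.PushOperand(Slice("v3"));  // newest operand; copied, since not pinned
    ctx.PushOperand(Slice("v2"));
    ctx.PushOperand(Slice("v1"));  // oldest operand
    // Forward order = the order the operands were originally written in:
    const std::vector<Slice>& ops = ctx.GetOperandsDirectionForward();
    // ops now holds {"v1", "v2", "v3"}; the unpinned copies are owned by
    // ctx, so the Slices remain valid for ctx's lifetime.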
+ +#include "db/merge_helper.h" + +#include + +#include "db/dbformat.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/likely.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "table/format.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, + const MergeOperator* user_merge_operator, + const CompactionFilter* compaction_filter, + Logger* logger, bool assert_valid_internal_key, + SequenceNumber latest_snapshot, + const SnapshotChecker* snapshot_checker, int level, + Statistics* stats, + const std::atomic* shutting_down) + : env_(env), + user_comparator_(user_comparator), + user_merge_operator_(user_merge_operator), + compaction_filter_(compaction_filter), + shutting_down_(shutting_down), + logger_(logger), + assert_valid_internal_key_(assert_valid_internal_key), + allow_single_operand_(false), + latest_snapshot_(latest_snapshot), + snapshot_checker_(snapshot_checker), + level_(level), + keys_(), + filter_timer_(env_), + total_filter_time_(0U), + stats_(stats) { + assert(user_comparator_ != nullptr); + if (user_merge_operator_) { + allow_single_operand_ = user_merge_operator_->AllowSingleOperand(); + } +} + +Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, + const Slice& key, const Slice* value, + const std::vector& operands, + std::string* result, Logger* logger, + Statistics* statistics, Env* env, + Slice* result_operand, + bool update_num_ops_stats) { + assert(merge_operator != nullptr); + + if (operands.size() == 0) { + assert(value != nullptr && result != nullptr); + result->assign(value->data(), value->size()); + return Status::OK(); + } + + if (update_num_ops_stats) { + RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS, + static_cast(operands.size())); + } + + bool success; + Slice tmp_result_operand(nullptr, 0); + const MergeOperator::MergeOperationInput merge_in(key, value, operands, + logger); + MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); + { + // Setup to time the merge + StopWatchNano timer(env, statistics != nullptr); + PERF_TIMER_GUARD(merge_operator_time_nanos); + + // Do the merge + success = merge_operator->FullMergeV2(merge_in, &merge_out); + + if (tmp_result_operand.data()) { + // FullMergeV2 result is an existing operand + if (result_operand != nullptr) { + *result_operand = tmp_result_operand; + } else { + result->assign(tmp_result_operand.data(), tmp_result_operand.size()); + } + } else if (result_operand) { + *result_operand = Slice(nullptr, 0); + } + + RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, + statistics ? timer.ElapsedNanos() : 0); + } + + if (!success) { + RecordTick(statistics, NUMBER_MERGE_FAILURES); + return Status::Corruption("Error: Could not perform merge."); + } + + return Status::OK(); +} + +// PRE: iter points to the first merge type entry +// POST: iter points to the first entry beyond the merge process (or the end) +// keys_, operands_ are updated to reflect the merge result. +// keys_ stores the list of keys encountered while merging. +// operands_ stores the list of merge operands encountered while merging. +// keys_[i] corresponds to operands_[i] for each i. +// +// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator +// and just pass the StripeRep corresponding to the stripe being merged. 
+Status MergeHelper::MergeUntil(InternalIterator* iter, + CompactionRangeDelAggregator* range_del_agg, + const SequenceNumber stop_before, + const bool at_bottom) { + // Get a copy of the internal key, before it's invalidated by iter->Next() + // Also maintain the list of merge operands seen. + assert(HasOperator()); + keys_.clear(); + merge_context_.Clear(); + has_compaction_filter_skip_until_ = false; + assert(user_merge_operator_); + bool first_key = true; + + // We need to parse the internal key again as the parsed key is + // backed by the internal key! + // Assume no internal key corruption as it has been successfully parsed + // by the caller. + // original_key_is_iter variable is just caching the information: + // original_key_is_iter == (iter->key().ToString() == original_key) + bool original_key_is_iter = true; + std::string original_key = iter->key().ToString(); + // Important: + // orig_ikey is backed by original_key if keys_.empty() + // orig_ikey is backed by keys_.back() if !keys_.empty() + ParsedInternalKey orig_ikey; + bool succ = ParseInternalKey(original_key, &orig_ikey); + assert(succ); + if (!succ) { + return Status::Corruption("Cannot parse key in MergeUntil"); + } + + Status s; + bool hit_the_next_user_key = false; + for (; iter->Valid(); iter->Next(), original_key_is_iter = false) { + if (IsShuttingDown()) { + return Status::ShutdownInProgress(); + } + + ParsedInternalKey ikey; + assert(keys_.size() == merge_context_.GetNumOperands()); + + if (!ParseInternalKey(iter->key(), &ikey)) { + // stop at corrupted key + if (assert_valid_internal_key_) { + assert(!"Corrupted internal key not expected."); + return Status::Corruption("Corrupted internal key not expected."); + } + break; + } else if (first_key) { + assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)); + first_key = false; + } else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) { + // hit a different user key, stop right here + hit_the_next_user_key = true; + break; + } else if (stop_before > 0 && ikey.sequence <= stop_before && + LIKELY(snapshot_checker_ == nullptr || + snapshot_checker_->CheckInSnapshot(ikey.sequence, + stop_before) != + SnapshotCheckerResult::kNotInSnapshot)) { + // hit an entry that's possibly visible by the previous snapshot, can't + // touch that + break; + } + + // At this point we are guaranteed that we need to process this key. + + assert(IsValueType(ikey.type)); + if (ikey.type != kTypeMerge) { + + // hit a put/delete/single delete + // => merge the put value or a nullptr with operands_ + // => store result in operands_.back() (and update keys_.back()) + // => change the entry type to kTypeValue for keys_.back() + // We are done! Success! + + // If there are no operands, just return the Status::OK(). That will cause + // the compaction iterator to write out the key we're currently at, which + // is the put/delete we just encountered. + if (keys_.empty()) { + return Status::OK(); + } + + // TODO(noetzli) If the merge operator returns false, we are currently + // (almost) silently dropping the put/delete. That's probably not what we + // want. Also if we're in compaction and it's a put, it would be nice to + // run compaction filter on it. 
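+      // Choose the base value for the merge: only a live kTypeValue that is
+      // not covered by a range tombstone contributes; otherwise the merge
+      // runs against nullptr, as if the key did not exist.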
+ const Slice val = iter->value(); + const Slice* val_ptr; + if (kTypeValue == ikey.type && + (range_del_agg == nullptr || + !range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal))) { + val_ptr = &val; + } else { + val_ptr = nullptr; + } + std::string merge_result; + s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, env_); + + // We store the result in keys_.back() and operands_.back() + // if nothing went wrong (i.e.: no operand corruption on disk) + if (s.ok()) { + // The original key encountered + original_key = std::move(keys_.back()); + orig_ikey.type = kTypeValue; + UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); + keys_.clear(); + merge_context_.Clear(); + keys_.emplace_front(std::move(original_key)); + merge_context_.PushOperand(merge_result); + } + + // move iter to the next entry + iter->Next(); + return s; + } else { + // hit a merge + // => if there is a compaction filter, apply it. + // => check for range tombstones covering the operand + // => merge the operand into the front of the operands_ list + // if not filtered + // => then continue because we haven't yet seen a Put/Delete. + // + // Keep queuing keys and operands until we either meet a put / delete + // request or later did a partial merge. + + Slice value_slice = iter->value(); + // add an operand to the list if: + // 1) it's included in one of the snapshots. in that case we *must* write + // it out, no matter what compaction filter says + // 2) it's not filtered by a compaction filter + CompactionFilter::Decision filter = + ikey.sequence <= latest_snapshot_ + ? CompactionFilter::Decision::kKeep + : FilterMerge(orig_ikey.user_key, value_slice); + if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil && + range_del_agg != nullptr && + range_del_agg->ShouldDelete( + iter->key(), RangeDelPositioningMode::kForwardTraversal)) { + filter = CompactionFilter::Decision::kRemove; + } + if (filter == CompactionFilter::Decision::kKeep || + filter == CompactionFilter::Decision::kChangeValue) { + if (original_key_is_iter) { + // this is just an optimization that saves us one memcpy + keys_.push_front(std::move(original_key)); + } else { + keys_.push_front(iter->key().ToString()); + } + if (keys_.size() == 1) { + // we need to re-anchor the orig_ikey because it was anchored by + // original_key before + ParseInternalKey(keys_.back(), &orig_ikey); + } + if (filter == CompactionFilter::Decision::kKeep) { + merge_context_.PushOperand( + value_slice, iter->IsValuePinned() /* operand_pinned */); + } else { // kChangeValue + // Compaction filter asked us to change the operand from value_slice + // to compaction_filter_value_. + merge_context_.PushOperand(compaction_filter_value_, false); + } + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + // Compaction filter asked us to remove this key altogether + // (not just this operand), along with some keys following it. 
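+        // Clearing the accumulated keys/operands makes FilteredUntil() the
+        // only output of this MergeUntil() call; the caller uses it to skip
+        // ahead to the filter's target key.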
+ keys_.clear(); + merge_context_.Clear(); + has_compaction_filter_skip_until_ = true; + return Status::OK(); + } + } + } + + if (merge_context_.GetNumOperands() == 0) { + // we filtered out all the merge operands + return Status::OK(); + } + + // We are sure we have seen this key's entire history if: + // at_bottom == true (this does not necessarily mean it is the bottommost + // layer, but rather that we are confident the key does not appear on any of + // the lower layers, at_bottom == false doesn't mean it does appear, just + // that we can't be sure, see Compaction::IsBottommostLevel for details) + // AND + // we have either encountered another key or end of key history on this + // layer. + // + // When these conditions are true we are able to merge all the keys + // using full merge. + // + // For these cases we are not sure about, we simply miss the opportunity + // to combine the keys. Since VersionSet::SetupOtherInputs() always makes + // sure that all merge-operands on the same level get compacted together, + // this will simply lead to these merge operands moving to the next level. + bool surely_seen_the_beginning = + (hit_the_next_user_key || !iter->Valid()) && at_bottom; + if (surely_seen_the_beginning) { + // do a final merge with nullptr as the existing value and say + // bye to the merge type (it's now converted to a Put) + assert(kTypeMerge == orig_ikey.type); + assert(merge_context_.GetNumOperands() >= 1); + assert(merge_context_.GetNumOperands() == keys_.size()); + std::string merge_result; + s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, env_); + if (s.ok()) { + // The original key encountered + // We are certain that keys_ is not empty here (see assertions couple of + // lines before). + original_key = std::move(keys_.back()); + orig_ikey.type = kTypeValue; + UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); + keys_.clear(); + merge_context_.Clear(); + keys_.emplace_front(std::move(original_key)); + merge_context_.PushOperand(merge_result); + } + } else { + // We haven't seen the beginning of the key nor a Put/Delete. + // Attempt to use the user's associative merge function to + // merge the stacked merge operands into a single operand. + s = Status::MergeInProgress(); + if (merge_context_.GetNumOperands() >= 2 || + (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) { + bool merge_success = false; + std::string merge_result; + { + StopWatchNano timer(env_, stats_ != nullptr); + PERF_TIMER_GUARD(merge_operator_time_nanos); + merge_success = user_merge_operator_->PartialMergeMulti( + orig_ikey.user_key, + std::deque(merge_context_.GetOperands().begin(), + merge_context_.GetOperands().end()), + &merge_result, logger_); + RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME, + stats_ ? timer.ElapsedNanosSafe() : 0); + } + if (merge_success) { + // Merging of operands (associative merge) was successful. 
+ // Replace operands with the merge result + merge_context_.Clear(); + merge_context_.PushOperand(merge_result); + keys_.erase(keys_.begin(), keys_.end() - 1); + } + } + } + + return s; +} + +MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper) + : merge_helper_(merge_helper) { + it_keys_ = merge_helper_->keys().rend(); + it_values_ = merge_helper_->values().rend(); +} + +void MergeOutputIterator::SeekToFirst() { + const auto& keys = merge_helper_->keys(); + const auto& values = merge_helper_->values(); + assert(keys.size() == values.size()); + it_keys_ = keys.rbegin(); + it_values_ = values.rbegin(); +} + +void MergeOutputIterator::Next() { + ++it_keys_; + ++it_values_; +} + +CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key, + const Slice& value_slice) { + if (compaction_filter_ == nullptr) { + return CompactionFilter::Decision::kKeep; + } + if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) { + filter_timer_.Start(); + } + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + auto ret = compaction_filter_->FilterV2( + level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice, + &compaction_filter_value_, compaction_filter_skip_until_.rep()); + if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) { + if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(), + user_key) <= 0) { + // Invalid skip_until returned from compaction filter. + // Keep the key as per FilterV2 documentation. + ret = CompactionFilter::Decision::kKeep; + } else { + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + } + } + total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + return ret; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h new file mode 100644 index 000000000..c0534f08b --- /dev/null +++ b/src/rocksdb/db/merge_helper.h @@ -0,0 +1,194 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include +#include +#include + +#include "db/dbformat.h" +#include "db/merge_context.h" +#include "db/range_del_aggregator.h" +#include "db/snapshot_checker.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +class Comparator; +class Iterator; +class Logger; +class MergeOperator; +class Statistics; + +class MergeHelper { + public: + MergeHelper(Env* env, const Comparator* user_comparator, + const MergeOperator* user_merge_operator, + const CompactionFilter* compaction_filter, Logger* logger, + bool assert_valid_internal_key, SequenceNumber latest_snapshot, + const SnapshotChecker* snapshot_checker = nullptr, int level = 0, + Statistics* stats = nullptr, + const std::atomic* shutting_down = nullptr); + + // Wrapper around MergeOperator::FullMergeV2() that records perf statistics. + // Result of merge will be written to result if status returned is OK. + // If operands is empty, the value will simply be copied to result. + // Set `update_num_ops_stats` to true if it is from a user read, so that + // the latency is sensitive. + // Returns one of the following statuses: + // - OK: Entries were successfully merged. 
+  //   - Corruption: Merge operator reported unsuccessful merge.
+  static Status TimedFullMerge(const MergeOperator* merge_operator,
+                               const Slice& key, const Slice* value,
+                               const std::vector<Slice>& operands,
+                               std::string* result, Logger* logger,
+                               Statistics* statistics, Env* env,
+                               Slice* result_operand = nullptr,
+                               bool update_num_ops_stats = false);
+
+  // Merge entries until we hit
+  //   - a corrupted key
+  //   - a Put/Delete,
+  //   - a different user key,
+  //   - a specific sequence number (snapshot boundary),
+  //   - REMOVE_AND_SKIP_UNTIL returned from compaction filter,
+  //  or - the end of iteration
+  // iter: (IN)  points to the first merge type entry
+  //       (OUT) points to the first entry not included in the merge process
+  // range_del_agg: (IN) filters merge operands covered by range tombstones.
+  // stop_before: (IN) a sequence number that merge should not cross.
+  //              0 means no restriction
+  // at_bottom: (IN) true if the iterator covers the bottom level, which means
+  //            we could reach the start of the history of this user key.
+  //
+  // Returns one of the following statuses:
+  // - OK: Entries were successfully merged.
+  // - MergeInProgress: Put/Delete not encountered, and didn't reach the start
+  //   of key's history. Output consists of merge operands only.
+  // - Corruption: Merge operator reported unsuccessful merge or a corrupted
+  //   key has been encountered and not expected (applies only when compiling
+  //   with asserts removed).
+  // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true).
+  //
+  // REQUIRED: The first key in the input is not corrupted.
+  Status MergeUntil(InternalIterator* iter,
+                    CompactionRangeDelAggregator* range_del_agg = nullptr,
+                    const SequenceNumber stop_before = 0,
+                    const bool at_bottom = false);
+
+  // Filters a merge operand using the compaction filter specified
+  // in the constructor. Returns the decision that the filter made.
+  // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the
+  // optional outputs of compaction filter.
+  CompactionFilter::Decision FilterMerge(const Slice& user_key,
+                                         const Slice& value_slice);
+
+  // Query the merge result
+  // These are valid until the next MergeUntil call
+  // If the merging was successful:
+  //   - keys() contains a single element with the latest sequence number of
+  //     the merges. The type will be Put or Merge. See IMPORTANT 1 note, below.
+  //   - values() contains a single element with the result of merging all the
+  //     operands together
+  //
+  //   IMPORTANT 1: the key type could change after the MergeUntil call.
+  //        Put/Delete + Merge + ... + Merge => Put
+  //        Merge + ... + Merge => Merge
+  //
+  // If the merge operator is not associative, and if a Put/Delete is not found
+  // then the merging will be unsuccessful. In this case:
+  //   - keys() contains the list of internal keys seen in order of iteration.
+  //   - values() contains the list of values (merges) seen in the same order.
+  //     values() is parallel to keys() so that the first entry in
+  //     keys() is the key associated with the first entry in values()
+  //     and so on. These lists will be the same length.
+  //     All of these pairs will be merges over the same user key.
+  //     See IMPORTANT 2 note below.
+  //
+  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+  //                So keys().back() was the first key seen by iterator.
+  // TODO: Re-style this comment to be like the first one
+  const std::deque<std::string>& keys() const { return keys_; }
+  const std::vector<Slice>& values() const {
+    return merge_context_.GetOperands();
+  }
+  uint64_t TotalFilterTime() const { return total_filter_time_; }
+  bool HasOperator() const { return user_merge_operator_ != nullptr; }
+
+  // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will
+  // return true and fill *until with the key to which we should skip.
+  // If true, keys() and values() are empty.
+  bool FilteredUntil(Slice* skip_until) const {
+    if (!has_compaction_filter_skip_until_) {
+      return false;
+    }
+    assert(compaction_filter_ != nullptr);
+    assert(skip_until != nullptr);
+    assert(compaction_filter_skip_until_.Valid());
+    *skip_until = compaction_filter_skip_until_.Encode();
+    return true;
+  }
+
+ private:
+  Env* env_;
+  const Comparator* user_comparator_;
+  const MergeOperator* user_merge_operator_;
+  const CompactionFilter* compaction_filter_;
+  const std::atomic<bool>* shutting_down_;
+  Logger* logger_;
+  bool assert_valid_internal_key_;  // enforce no internal key corruption?
+  bool allow_single_operand_;
+  SequenceNumber latest_snapshot_;
+  const SnapshotChecker* const snapshot_checker_;
+  int level_;
+
+  // the scratch area that holds the result of MergeUntil
+  // valid up to the next MergeUntil call
+
+  // Keeps track of the sequence of keys seen
+  std::deque<std::string> keys_;
+  // Parallel with keys_; stores the operands
+  mutable MergeContext merge_context_;
+
+  StopWatchNano filter_timer_;
+  uint64_t total_filter_time_;
+  Statistics* stats_;
+
+  bool has_compaction_filter_skip_until_ = false;
+  std::string compaction_filter_value_;
+  InternalKey compaction_filter_skip_until_;
+
+  bool IsShuttingDown() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+  }
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+  // The MergeOutputIterator is bound to a MergeHelper instance.
+  explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+  // Seeks to the first record in the output.
+  void SeekToFirst();
+  // Advances to the next record in the output.
+  void Next();
+
+  Slice key() { return Slice(*it_keys_); }
+  Slice value() { return Slice(*it_values_); }
+  bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+  const MergeHelper* merge_helper_;
+  std::deque<std::string>::const_reverse_iterator it_keys_;
+  std::vector<Slice>::const_reverse_iterator it_values_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..117916c8f
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
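Editorial aside: consuming a merge result with the MergeOutputIterator declared above looks roughly like the sketch below, assuming a MergeHelper named `merge` on which MergeUntil() has just returned OK or MergeInProgress:

    MergeOutputIterator out(&merge);
    for (out.SeekToFirst(); out.Valid(); out.Next()) {
      // key()/value() walk keys()/values() via reverse iterators, so
      // iteration starts at keys().back(), the first key the input iterator
      // produced (see IMPORTANT 2 above).
      Slice k = out.key();
      Slice v = out.value();
      // ... emit k/v ...
    }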
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..117916c8f
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_helper.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeHelperTest : public testing::Test {
+ public:
+  MergeHelperTest() { env_ = Env::Default(); }
+
+  ~MergeHelperTest() override = default;
+
+  Status Run(SequenceNumber stop_before, bool at_bottom,
+             SequenceNumber latest_snapshot = 0) {
+    iter_.reset(new test::VectorIterator(ks_, vs_));
+    iter_->SeekToFirst();
+    merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(),
+                                        merge_op_.get(), filter_.get(), nullptr,
+                                        false, latest_snapshot));
+    return merge_helper_->MergeUntil(iter_.get(), nullptr /* range_del_agg */,
+                                     stop_before, at_bottom);
+  }
+
+  void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+                 const ValueType& t, const std::string& val,
+                 bool corrupt = false) {
+    InternalKey ikey(user_key, seq, t);
+    if (corrupt) {
+      test::CorruptKeyType(&ikey);
+    }
+    ks_.push_back(ikey.Encode().ToString());
+    vs_.push_back(val);
+  }
+
+  Env* env_;
+  std::unique_ptr<test::VectorIterator> iter_;
+  std::shared_ptr<MergeOperator> merge_op_;
+  std::unique_ptr<MergeHelper> merge_helper_;
+  std::vector<std::string> ks_;
+  std::vector<std::string> vs_;
+  std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no more history and it can merge keys.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U));  // <- iter_ after merge
+
+  ASSERT_TRUE(Run(0, true).ok());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));  // <- iter_ after merge
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(0, false).ok());
+  ASSERT_EQ(ks_[3], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + + AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U)); + AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U)); // <- iter_ after merge + AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U)); + AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U)); + AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U)); + + ASSERT_TRUE(Run(31, true).IsMergeInProgress()); + ASSERT_EQ(ks_[2], iter_->key()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]); + ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]); + ASSERT_EQ(1U, merge_helper_->keys().size()); + ASSERT_EQ(1U, merge_helper_->values().size()); +} + +// MergeHelper preserves the operand stack for merge operators that +// cannot do a partial merge. +TEST_F(MergeHelperTest, NoPartialMerge) { + merge_op_ = MergeOperators::CreateStringAppendTESTOperator(); + + AddKeyVal("a", 50, kTypeMerge, "v2"); + AddKeyVal("a", 40, kTypeMerge, "v"); // <- iter_ after merge + AddKeyVal("a", 30, kTypeMerge, "v"); + + ASSERT_TRUE(Run(31, true).IsMergeInProgress()); + ASSERT_EQ(ks_[2], iter_->key()); + ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]); + ASSERT_EQ("v", merge_helper_->values()[0]); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]); + ASSERT_EQ("v2", merge_helper_->values()[1]); + ASSERT_EQ(2U, merge_helper_->keys().size()); + ASSERT_EQ(2U, merge_helper_->values().size()); +} + +// A single operand can not be merged. +TEST_F(MergeHelperTest, SingleOperand) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + + AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U)); + + ASSERT_TRUE(Run(31, false).IsMergeInProgress()); + ASSERT_FALSE(iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]); + ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]); + ASSERT_EQ(1U, merge_helper_->keys().size()); + ASSERT_EQ(1U, merge_helper_->values().size()); +} + +// Merging with a deletion turns the deletion into a value +TEST_F(MergeHelperTest, MergeDeletion) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + + AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U)); + AddKeyVal("a", 20, kTypeDeletion, ""); + + ASSERT_TRUE(Run(15, false).ok()); + ASSERT_FALSE(iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]); + ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]); + ASSERT_EQ(1U, merge_helper_->keys().size()); + ASSERT_EQ(1U, merge_helper_->values().size()); +} + +// The merge helper stops upon encountering a corrupt key +TEST_F(MergeHelperTest, CorruptKey) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + + AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U)); + AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U)); + // Corrupt key + AddKeyVal("a", 20, kTypeDeletion, "", true); // <- iter_ after merge + + ASSERT_TRUE(Run(15, false).IsMergeInProgress()); + ASSERT_EQ(ks_[2], iter_->key()); + ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]); + ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]); + ASSERT_EQ(1U, merge_helper_->keys().size()); + ASSERT_EQ(1U, merge_helper_->values().size()); +} + +// The compaction filter is called on every merge operand +TEST_F(MergeHelperTest, FilterMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + filter_.reset(new test::FilterNumber(5U)); + + AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U)); + AddKeyVal("a", 
29, kTypeMerge, test::EncodeInt(5U)); // Filtered + AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U)); + AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U)); + AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered + AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U)); + + ASSERT_TRUE(Run(15, false).ok()); + ASSERT_FALSE(iter_->Valid()); + MergeOutputIterator merge_output_iter(merge_helper_.get()); + merge_output_iter.SeekToFirst(); + ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), + merge_output_iter.key().ToString()); + ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString()); + merge_output_iter.Next(); + ASSERT_FALSE(merge_output_iter.Valid()); +} + +TEST_F(MergeHelperTest, FilterAllMergeOperands) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + filter_.reset(new test::FilterNumber(5U)); + + AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); + AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U)); + AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U)); + AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U)); + AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); + AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); + + // filtered out all + ASSERT_TRUE(Run(15, false).ok()); + ASSERT_FALSE(iter_->Valid()); + MergeOutputIterator merge_output_iter(merge_helper_.get()); + merge_output_iter.SeekToFirst(); + ASSERT_FALSE(merge_output_iter.Valid()); + + // we have one operand that will survive because it's a delete + AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U)); + AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U)); + ASSERT_TRUE(Run(15, true).ok()); + merge_output_iter = MergeOutputIterator(merge_helper_.get()); + ASSERT_TRUE(iter_->Valid()); + merge_output_iter.SeekToFirst(); + ASSERT_FALSE(merge_output_iter.Valid()); + + // when all merge operands are filtered out, we leave the iterator pointing to + // the Put/Delete that survived + ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString()); + ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString()); +} + +// Make sure that merge operands are filtered at the beginning +TEST_F(MergeHelperTest, FilterFirstMergeOperand) { + merge_op_ = MergeOperators::CreateUInt64AddOperator(); + filter_.reset(new test::FilterNumber(5U)); + + AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U)); // Filtered + AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); // Filtered + AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U)); + AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U)); + AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U)); + AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered + AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); // Filtered + AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U)); // next user key + + ASSERT_OK(Run(15, true)); + ASSERT_TRUE(iter_->Valid()); + MergeOutputIterator merge_output_iter(merge_helper_.get()); + merge_output_iter.SeekToFirst(); + // sequence number is 29 here, because the first merge operand got filtered + // out + ASSERT_EQ(test::KeyStr("a", 29, kTypeValue), + merge_output_iter.key().ToString()); + ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString()); + merge_output_iter.Next(); + ASSERT_FALSE(merge_output_iter.Valid()); + + // make sure that we're passing user keys into the filter + ASSERT_EQ("a", filter_->last_merge_operand_key()); +} + +// Make sure that merge operands are not filtered out if there's a snapshot +// pointing at them +TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) { + merge_op_ = 
MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+  ASSERT_OK(Run(15, true, 32));
+  ASSERT_TRUE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
new file mode 100644
index 000000000..75dea432c
--- /dev/null
+++ b/src/rocksdb/db/merge_operator.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                MergeOperationOutput* merge_out) const {
+  // If FullMergeV2 is not implemented, we convert the operand_list to
+  // std::deque<std::string> and pass it to FullMerge
+  std::deque<std::string> operand_list_str;
+  for (auto& op : merge_in.operand_list) {
+    operand_list_str.emplace_back(op.data(), op.size());
+  }
+  return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str,
+                   &merge_out->new_value, merge_in.logger);
+}
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+                                      const std::deque<Slice>& operand_list,
+                                      std::string* new_value,
+                                      Logger* logger) const {
+  assert(operand_list.size() >= 2);
+  // Simply loop through the operands
+  Slice temp_slice(operand_list[0]);
+
+  for (size_t i = 1; i < operand_list.size(); ++i) {
+    auto& operand = operand_list[i];
+    std::string temp_value;
+    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_slice = Slice(*new_value);
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
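+
+// For reference, a minimal sketch of a user-defined associative operator
+// that the defaults above would drive (illustrative only; this
+// "Uint64AddOperator" is a hypothetical example, not the library's
+// built-in one):
+//
+//   class Uint64AddOperator : public AssociativeMergeOperator {
+//    public:
+//     bool Merge(const Slice& /*key*/, const Slice* existing_value,
+//                const Slice& value, std::string* new_value,
+//                Logger* /*logger*/) const override {
+//       uint64_t base =
+//           existing_value ? DecodeFixed64(existing_value->data()) : 0;
+//       PutFixed64(new_value, base + DecodeFixed64(value.data()));
+//       return true;
+//     }
+//     const char* Name() const override { return "Uint64AddOperator"; }
+//   };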
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {
+  // Simply loop through the operands
+  Slice temp_existing;
+  const Slice* existing_value = merge_in.existing_value;
+  for (const auto& operand : merge_in.operand_list) {
+    std::string temp_value;
+    if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+               merge_in.logger)) {
+      return false;
+    }
+    swap(temp_value, merge_out->new_value);
+    temp_existing = Slice(merge_out->new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Call the user-defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+    const Slice& key,
+    const Slice& left_operand,
+    const Slice& right_operand,
+    std::string* new_value,
+    Logger* logger) const {
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
new file mode 100644
index 000000000..3f85f6464
--- /dev/null
+++ b/src/rocksdb/db/merge_test.cc
@@ -0,0 +1,504 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <assert.h>
+#include <iostream>
+#include <memory>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool use_compression;
+
+class MergeTest : public testing::Test {};
+
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+  CountMergeOperator() {
+    mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+  }
+
+  bool Merge(const Slice& key, const Slice* existing_value, const Slice& value,
+             std::string* new_value, Logger* logger) const override {
+    assert(new_value->empty());
+    ++num_merge_operator_calls;
+    if (existing_value == nullptr) {
+      new_value->assign(value.data(), value.size());
+      return true;
+    }
+
+    return mergeOperator_->PartialMerge(
+        key,
+        *existing_value,
+        value,
+        new_value,
+        logger);
+  }
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    assert(new_value->empty());
+    ++num_partial_merge_calls;
+    return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+                                             logger);
+  }
+
+  const char* Name() const override { return "UInt64AddOperator"; }
+
+ private:
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
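+
+// In brief, how the counters above get exercised (illustrative only):
+// DB::Merge() merely records an operand, so num_merge_operator_calls stays
+// at zero until a read (or the max_successive_merges limit) forces the
+// operands to be combined, e.g.:
+//
+//   db->Merge(WriteOptions(), "k", encoded_uint64);  // enqueue operand
+//   db->Get(ReadOptions(), "k", &out);               // merge happens here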
+
+std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+                           const size_t max_successive_merges = 0) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = max_successive_merges;
+  Status s;
+  DestroyDB(dbname, Options());
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+  if (ttl) {
+    DBWithTTL* db_with_ttl;
+    s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+    db = db_with_ttl;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+#else
+  assert(!ttl);
+  s = DB::Open(options, dbname, &db);
+#endif // !ROCKSDB_LITE
+  if (!s.ok()) {
+    std::cerr << s.ToString() << std::endl;
+    assert(false);
+  }
+  return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high-level operations:
+// set, add, get and remove.
+// This is a quick implementation without a Merge operation.
+class Counters {
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // Public interface of Counters.
+  // All four functions return false
+  // if the underlying DB operation failed.
+
+  // mapped to a leveldb Put
+  bool set(const std::string& key, uint64_t value) {
+    // just treat the internal rep of int64 as the string
+    char buf[sizeof(value)];
+    EncodeFixed64(buf, value);
+    Slice slice(buf, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const std::string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const std::string& key, uint64_t* value) {
+    std::string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        std::cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const std::string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+  // convenience functions for testing
+  void assert_set(const std::string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const std::string& key) { assert(remove(key)); }
+
+  uint64_t assert_get(const std::string& key) {
+    uint64_t value = default_;
+    int result = get(key, &value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+    return value;
+  }
+
+  void assert_add(const std::string& key, uint64_t value) {
+    int result = add(key, value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+  }
+};
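+
+// A minimal usage sketch (illustrative only; "db" is assumed to come from
+// OpenDb() above):
+//
+//   Counters counters(db, 0 /* defaultCount */);
+//   counters.assert_set("rev", 1);  // Put
+//   counters.assert_add("rev", 2);  // get -> modify -> set
+//   assert(counters.assert_get("rev") == 3);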
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+  WriteOptions merge_option_;  // for merge
+
+ public:
+  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : Counters(db, defaultCount),
+        merge_option_() {
+  }
+
+  // mapped to a rocksdb Merge operation
+  bool add(const std::string& key, uint64_t value) override {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+};
+
+void dumpDb(DB* db) {
+  auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    // uint64_t value = DecodeFixed64(it->value().data());
+    // std::cout << it->key().ToString() << ": " << value << std::endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+  FlushOptions o;
+  o.wait = true;
+
+  counters.assert_set("a", 1);
+
+  if (test_compaction) db->Flush(o);
+
+  assert(counters.assert_get("a") == 1);
+
+  counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+  assert(counters.assert_get("b") == 0);
+
+  counters.assert_add("a", 2);
+
+  if (test_compaction) db->Flush(o);
+
+  // 1+2 = 3
+  assert(counters.assert_get("a") == 3);
+
+  dumpDb(db);
+
+  // 1+...+49 = ?
+  uint64_t sum = 0;
+  for (int i = 1; i < 50; i++) {
+    counters.assert_add("b", i);
+    sum += i;
+  }
+  assert(counters.assert_get("b") == sum);
+
+  dumpDb(db);
+
+  if (test_compaction) {
+    db->Flush(o);
+
+    db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+    dumpDb(db);
+
+    assert(counters.assert_get("a") == 3);
+    assert(counters.assert_get("b") == sum);
+  }
+}
+
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+                         size_t num_merges) {
+  counters.assert_remove("z");
+  uint64_t sum = 0;
+
+  for (size_t i = 1; i <= num_merges; ++i) {
+    resetNumMergeOperatorCalls();
+    counters.assert_add("z", i);
+    sum += i;
+
+    if (i % (max_num_merges + 1) == 0) {
+      assert(num_merge_operator_calls == max_num_merges + 1);
+    } else {
+      assert(num_merge_operator_calls == 0);
+    }
+
+    resetNumMergeOperatorCalls();
+    assert(counters.assert_get("z") == sum);
+    assert(num_merge_operator_calls == i % (max_num_merges + 1));
+  }
+}
+
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+                      size_t min_merge, size_t count) {
+  FlushOptions o;
+  o.wait = true;
+
+  // Test case 1: partial merge should be called when the number of merge
+  //              operands exceeds the threshold.
+  uint64_t tmp_sum = 0;
+  resetNumPartialMergeCalls();
+  for (size_t i = 1; i <= count; i++) {
+    counters->assert_add("b", i);
+    tmp_sum += i;
+  }
+  db->Flush(o);
+  db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+  if (count > max_merge) {
+    // in this case, FullMerge should be called instead.
+    ASSERT_EQ(num_partial_merge_calls, 0U);
+  } else {
+    // if count >= min_merge, then partial merge should be called once.
+    ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+  }
+
+  // Test case 2: partial merge should not be called when a put is found.
+  resetNumPartialMergeCalls();
+  tmp_sum = 0;
+  db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10");
+  for (size_t i = 1; i <= count; i++) {
+    counters->assert_add("c", i);
+    tmp_sum += i;
+  }
+  db->Flush(o);
+  db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+  ASSERT_EQ(num_partial_merge_calls, 0U);
+}
+
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+                                    size_t num_merges) {
+  assert(num_merges > max_num_merges);
+
+  Slice key("BatchSuccessiveMerge");
+  uint64_t merge_value = 1;
+  char buf[sizeof(merge_value)];
+  EncodeFixed64(buf, merge_value);
+  Slice merge_value_slice(buf, sizeof(merge_value));
+
+  // Create the batch
+  WriteBatch batch;
+  for (size_t i = 0; i < num_merges; ++i) {
+    batch.Merge(key, merge_value_slice);
+  }
+
+  // Apply to memtable and count the number of merges
+  resetNumMergeOperatorCalls();
+  {
+    Status s = db->Write(WriteOptions(), &batch);
+    assert(s.ok());
+  }
+  ASSERT_EQ(
+      num_merge_operator_calls,
+      static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
+
+  // Get the value
+  resetNumMergeOperatorCalls();
+  std::string get_value_str;
+  {
+    Status s = db->Get(ReadOptions(), key, &get_value_str);
+    assert(s.ok());
+  }
+  assert(get_value_str.size() == sizeof(uint64_t));
+  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+  ASSERT_EQ(get_value, num_merges * merge_value);
+  ASSERT_EQ(num_merge_operator_calls,
+            static_cast<size_t>((num_merges % (max_num_merges + 1))));
+}
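+
+// Worked example of the counting logic verified above (illustrative):
+// testSingleBatchSuccessiveMerge(db, 5, 7) writes a batch of 7 merges with
+// max_successive_merges = 5. The write path folds operands in groups of
+// (max + 1) = 6, so 6 merge-operator calls happen during Write() and the
+// remaining 7 % 6 = 1 operand is merged during Get().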
+
+void runTest(const std::string& dbname, const bool use_ttl = false) {
+  {
+    auto db = OpenDb(dbname, use_ttl);
+
+    {
+      Counters counters(db, 0);
+      testCounters(counters, db.get(), true);
+    }
+
+    {
+      MergeBasedCounters counters(db, 0);
+      testCounters(counters, db.get(), use_compression);
+    }
+  }
+
+  DestroyDB(dbname, Options());
+
+  {
+    size_t max_merge = 5;
+    auto db = OpenDb(dbname, use_ttl, max_merge);
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), use_compression);
+    testSuccessiveMerge(counters, max_merge, max_merge * 2);
+    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    DestroyDB(dbname, Options());
+  }
+
+  {
+    size_t max_merge = 100;
+    // Min merge is hard-coded to 2.
+    uint32_t min_merge = 2;
+    for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+      auto db = OpenDb(dbname, use_ttl, max_merge);
+      MergeBasedCounters counters(db, 0);
+      testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+      DestroyDB(dbname, Options());
+    }
+    {
+      auto db = OpenDb(dbname, use_ttl, max_merge);
+      MergeBasedCounters counters(db, 0);
+      testPartialMerge(&counters, db.get(), max_merge, min_merge,
+                       min_merge * 10);
+      DestroyDB(dbname, Options());
+    }
+  }
+
+  {
+    {
+      auto db = OpenDb(dbname);
+      MergeBasedCounters counters(db, 0);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    }
+
+    DB* reopen_db;
+    ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
+    std::string value;
+    ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok()));
+    delete reopen_db;
+    DestroyDB(dbname, Options());
+  }
+
+  /* Temporarily removed test:
+  {
+    std::cout << "Test merge-operator not set after reopen (recovery case)\n";
+    {
+      auto db = OpenDb(dbname);
+      MergeBasedCounters counters(db, 0);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+    }
+
+    DB* reopen_db;
+    ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
+  }
+  */
+}
+
+TEST_F(MergeTest, MergeDbTest) {
+  runTest(test::PerThreadDBPath("merge_testdb"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(MergeTest, MergeDbTtlTest) {
+  runTest(test::PerThreadDBPath("merge_testdbttl"),
+          true);  // Run test on TTL database
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::use_compression = false;
+  if (argc > 1) {
+    ROCKSDB_NAMESPACE::use_compression = true;
+  }
+
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/obsolete_files_test.cc b/src/rocksdb/db/obsolete_files_test.cc
new file mode 100644
index 000000000..bf018a0e3
--- /dev/null
+++ b/src/rocksdb/db/obsolete_files_test.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::flush;
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObsoleteFilesTest : public DBTestBase {
+ public:
+  ObsoleteFilesTest()
+      : DBTestBase("/obsolete_files_test"), wal_dir_(dbname_ + "/wal_files") {}
+
+  void AddKeys(int numkeys, int startkey) {
+    WriteOptions options;
+    options.sync = false;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+      std::string temp = ToString(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  void createLevel0Files(int numFiles, int numKeysPerFile) {
+    int startKey = 0;
+    for (int i = 0; i < numFiles; i++) {
+      AddKeys(numKeysPerFile, startKey);
+      startKey += numKeysPerFile;
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+  }
+
+  void CheckFileTypeCounts(const std::string& dir, int required_log,
+                           int required_sst, int required_manifest) {
+    std::vector<std::string> filenames;
+    env_->GetChildren(dir, &filenames);
+
+    int log_cnt = 0;
+    int sst_cnt = 0;
+    int manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kLogFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
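+
+  // A minimal sketch of the ParseFileName-based classification used above
+  // (illustrative only; "CountSstFiles" is a hypothetical helper):
+  //
+  //   int CountSstFiles(Env* env, const std::string& dir) {
+  //     std::vector<std::string> files;
+  //     env->GetChildren(dir, &files);
+  //     int count = 0;
+  //     for (const auto& f : files) {
+  //       uint64_t number;
+  //       FileType type;
+  //       if (ParseFileName(f, &number, &type) && type == kTableFile) {
+  //         ++count;
+  //       }
+  //     }
+  //     return count;
+  //   }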
+
+  void ReopenDB() {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    // Trigger compaction when the number of level 0 files reaches 2.
+    options.level0_file_num_compaction_trigger = 2;
+    options.disable_auto_compactions = false;
+    options.delete_obsolete_files_period_micros = 0;  // always do full purge
+    options.enable_thread_tracking = true;
+    options.write_buffer_size = 1024 * 1024 * 1000;
+    options.target_file_size_base = 1024 * 1024 * 1000;
+    options.max_bytes_for_level_base = 1024 * 1024 * 1000;
+    options.WAL_ttl_seconds = 300;     // Used to test log files
+    options.WAL_size_limit_MB = 1024;  // Used to test log files
+    options.wal_dir = wal_dir_;
+    Destroy(options);
+    Reopen(options);
+  }
+
+  const std::string wal_dir_;
+};
+
+TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
+  ReopenDB();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
+       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
+      {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
+  });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
+        Status* p_status = reinterpret_cast<Status*>(arg);
+        ASSERT_OK(*p_status);
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
+        std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
+            reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
+        ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  createLevel0Files(2, 50000);
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+  port::Thread user_thread([this]() {
+    JobContext jobCxt(0);
+    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */,
+                                false /* no_full_scan=false */);
+    dbfull()->TEST_UnlockMutex();
+    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
+    dbfull()->PurgeObsoleteFiles(jobCxt);
+    jobCxt.Clean();
+  });
+
+  user_thread.join();
+}
+
+TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) {
+  ReopenDB();
+  SyncPoint::GetInstance()->DisableProcessing();
+  std::vector<uint64_t> optsfiles_nums;
+  std::vector<bool> optsfiles_keep;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", [&](void* arg) {
+        optsfiles_nums.push_back(*reinterpret_cast<uint64_t*>(arg));
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", [&](void* arg) {
+        optsfiles_keep.push_back(*reinterpret_cast<bool*>(arg));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  createLevel0Files(2, 50000);
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+  ASSERT_OK(dbfull()->DisableFileDeletions());
+  for (int i = 0; i != 4; ++i) {
+    if (i % 2) {
+      ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+                                     {{"paranoid_file_checks", "false"}}));
+    } else {
+      ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+                                     {{"paranoid_file_checks", "true"}}));
+    }
+  }
+  ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */));
+  ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size());
+
+  Close();
+
+  std::vector<std::string> files;
+  int opts_file_count = 0;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  for (const auto& file : files) {
+    uint64_t file_num;
+    Slice dummy_info_log_name_prefix;
+    FileType type;
+    WalFileType log_type;
+    if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type,
+                      &log_type) &&
+        type == kOptionsFile) {
+      opts_file_count++;
+    }
+  }
+  ASSERT_EQ(2, opts_file_count);
+}
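+
+// The sync-point pattern both tests above rely on, in brief (illustrative
+// only; the sync point name is a hypothetical placeholder):
+//
+//   SyncPoint::GetInstance()->SetCallBack(
+//       "SomeModule::SomeFunction:Tag",
+//       [&](void* arg) { /* inspect or record *arg */ });
+//   SyncPoint::GetInstance()->EnableProcessing();
+//   // ... run the code that passes through the sync point ...
+//   SyncPoint::GetInstance()->DisableProcessing();
+//   SyncPoint::GetInstance()->ClearAllCallBacks();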
+
+} // namespace ROCKSDB_NAMESPACE
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/options_file_test.cc b/src/rocksdb/db/options_file_test.cc
new file mode 100644
index 000000000..00427de8a
--- /dev/null
+++ b/src/rocksdb/db/options_file_test.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsFileTest : public testing::Test {
+ public:
+  OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {}
+
+  std::string dbname_;
+};
+
+namespace {
+void UpdateOptionsFiles(DB* db,
+                        std::unordered_set<std::string>* filename_history,
+                        int* options_files_count) {
+  std::vector<std::string> filenames;
+  db->GetEnv()->GetChildren(db->GetName(), &filenames);
+  uint64_t number;
+  FileType type;
+  *options_files_count = 0;
+  for (auto filename : filenames) {
+    if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+      filename_history->insert(filename);
+      (*options_files_count)++;
+    }
+  }
+}
+
+// Verify whether the current Options Files are the latest ones.
+void VerifyOptionsFileName(
+    DB* db, const std::unordered_set<std::string>& past_filenames) {
+  std::vector<std::string> filenames;
+  std::unordered_set<std::string> current_filenames;
+  db->GetEnv()->GetChildren(db->GetName(), &filenames);
+  uint64_t number;
+  FileType type;
+  for (auto filename : filenames) {
+    if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+      current_filenames.insert(filename);
+    }
+  }
+  for (auto past_filename : past_filenames) {
+    if (current_filenames.find(past_filename) != current_filenames.end()) {
+      continue;
+    }
+    for (auto filename : current_filenames) {
+      ASSERT_GT(filename, past_filename);
+    }
+  }
+}
+} // namespace
+
+TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
+  const int kReopenCount = 20;
+  Options opt;
+  opt.create_if_missing = true;
+  DestroyDB(dbname_, opt);
+  std::unordered_set<std::string> filename_history;
+  DB* db;
+  for (int i = 0; i < kReopenCount; ++i) {
+    ASSERT_OK(DB::Open(opt, dbname_, &db));
+    int num_options_files = 0;
+    UpdateOptionsFiles(db, &filename_history, &num_options_files);
+    ASSERT_GT(num_options_files, 0);
+    ASSERT_LE(num_options_files, 2);
+    // Make sure we always keep the latest option files.
+    VerifyOptionsFileName(db, filename_history);
+    delete db;
+  }
+}
+
+TEST_F(OptionsFileTest, OptionsFileName) {
+  const uint64_t kOptionsFileNum = 12345;
+  uint64_t number;
+  FileType type;
+
+  auto options_file_name = OptionsFileName("", kOptionsFileNum);
+  ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr));
+  ASSERT_EQ(type, kOptionsFile);
+  ASSERT_EQ(number, kOptionsFileNum);
+
+  const uint64_t kTempOptionsFileNum = 54352;
+  auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum);
+  ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr));
+  ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix),
+            std::string::npos);
+  ASSERT_EQ(type, kTempFile);
+  ASSERT_EQ(number, kTempOptionsFileNum);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+}
+#else
+
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  printf("Skipped as Options file is not supported in RocksDBLite.\n");
+  return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc
new file mode 100644
index 000000000..86f2db7b6
--- /dev/null
+++ b/src/rocksdb/db/perf_context_test.cc
@@ -0,0 +1,981 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <algorithm>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "test_util/testharness.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+bool FLAGS_verbose = false;
+
+// Path to the database on file system
+const std::string kDbName =
+    ROCKSDB_NAMESPACE::test::PerThreadDBPath("perf_context_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.max_open_files = -1;
+  options.write_buffer_size = FLAGS_write_buffer_size;
+  options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+  options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+  if (FLAGS_use_set_based_memetable) {
+#ifndef ROCKSDB_LITE
+    options.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0));
+    options.memtable_factory.reset(NewHashSkipListRepFactory());
+#endif // ROCKSDB_LITE
+  }
+
+  Status s;
+  if (!read_only) {
+    s = DB::Open(options, kDbName, &db);
+  } else {
+    s = DB::OpenForReadOnly(options, kDbName, &db);
+  }
+  EXPECT_OK(s);
+  return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest : public testing::Test {};
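+
+// The tests below all follow the same thread-local pattern (sketch,
+// illustrative only):
+//
+//   std::string value;
+//   SetPerfLevel(kEnableTime);    // or kEnableCount, kDisable, ...
+//   get_perf_context()->Reset();  // clear this thread's counters
+//   db->Get(ReadOptions(), "some-key", &value);
+//   auto n = get_perf_context()->user_key_comparison_count;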
+
+TEST_F(PerfContextTest, SeekIntoDeletion) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    db->Put(write_options, key, value);
+  }
+
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + ToString(i);
+    db->Delete(write_options, key);
+  }
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_time;
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + ToString(i);
+    std::string value;
+
+    get_perf_context()->Reset();
+    StopWatchNano timer(Env::Default());
+    timer.Start();
+    auto status = db->Get(read_options, key, &value);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    ASSERT_TRUE(status.IsNotFound());
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+    hist_get_time.Add(elapsed_nanos);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Get user key comparison: \n" << hist_get.ToString()
+              << "Get time: \n" << hist_get_time.ToString();
+  }
+
+  {
+    HistogramImpl hist_seek_to_first;
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    get_perf_context()->Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->SeekToFirst();
+    hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count);
+    auto elapsed_nanos = timer.ElapsedNanos();
+
+    if (FLAGS_verbose) {
+      std::cout << "SeekToFirst user key comparison: \n"
+                << hist_seek_to_first.ToString()
+                << "ikey skipped: "
+                << get_perf_context()->internal_key_skipped_count << "\n"
+                << "idelete skipped: "
+                << get_perf_context()->internal_delete_skipped_count << "\n"
+                << "elapsed: " << elapsed_nanos << "\n";
+    }
+  }
+
+  HistogramImpl hist_seek;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    std::string key = "k" + ToString(i);
+
+    get_perf_context()->Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    hist_seek.Add(get_perf_context()->user_key_comparison_count);
+    if (FLAGS_verbose) {
+      std::cout << "seek cmp: "
+                << get_perf_context()->user_key_comparison_count
+                << " ikey skipped "
+                << get_perf_context()->internal_key_skipped_count
+                << " idelete skipped "
+                << get_perf_context()->internal_delete_skipped_count
+                << " elapsed: " << elapsed_nanos << "ns\n";
+    }
+
+    get_perf_context()->Reset();
+    ASSERT_TRUE(iter->Valid());
+    StopWatchNano timer2(Env::Default(), true);
+    iter->Next();
+    auto elapsed_nanos2 = timer2.ElapsedNanos();
+    if (FLAGS_verbose) {
+      std::cout << "next cmp: "
+                << get_perf_context()->user_key_comparison_count
+                << "elapsed: " << elapsed_nanos2 << "ns\n";
+    }
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Seek user key comparison: \n" << hist_seek.ToString();
+  }
+}
+
+TEST_F(PerfContextTest, StopWatchNanoOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatchNano timer(Env::Default(), true);
+  for (auto& timing : timings) {
+    timing = timer.ElapsedNanos(true /* reset */);
+  }
+
+  HistogramImpl histogram;
+  for (const auto timing : timings) {
+    histogram.Add(timing);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << histogram.ToString();
+  }
+}
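+
+// StopWatchNano, as used above, in brief (illustrative only):
+//
+//   StopWatchNano timer(Env::Default(), true /* auto_start */);
+//   // ... code under measurement ...
+//   uint64_t nanos = timer.ElapsedNanos(true /* reset */);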
+
+TEST_F(PerfContextTest, StopWatchOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  uint64_t elapsed = 0;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatch timer(Env::Default(), nullptr, 0, &elapsed);
+  for (auto& timing : timings) {
+    timing = elapsed;
+  }
+
+  HistogramImpl histogram;
+  uint64_t prev_timing = 0;
+  for (const auto timing : timings) {
+    histogram.Add(timing - prev_timing);
+    prev_timing = timing;
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << histogram.ToString();
+  }
+}
+
+void ProfileQueries(bool enabled_time = false) {
+  DestroyDB(kDbName, Options());  // Start this test with a fresh DB
+
+  auto db = OpenDb();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  HistogramImpl hist_put;
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_snapshot;
+  HistogramImpl hist_get_memtable;
+  HistogramImpl hist_get_files;
+  HistogramImpl hist_get_post_process;
+  HistogramImpl hist_num_memtable_checked;
+
+  HistogramImpl hist_mget;
+  HistogramImpl hist_mget_snapshot;
+  HistogramImpl hist_mget_memtable;
+  HistogramImpl hist_mget_files;
+  HistogramImpl hist_mget_post_process;
+  HistogramImpl hist_mget_num_memtable_checked;
+
+  HistogramImpl hist_write_pre_post;
+  HistogramImpl hist_write_wal_time;
+  HistogramImpl hist_write_memtable_time;
+  HistogramImpl hist_write_delay_time;
+  HistogramImpl hist_write_thread_wait_nanos;
+  HistogramImpl hist_write_scheduling_time;
+
+  uint64_t total_db_mutex_nanos = 0;
+
+  if (FLAGS_verbose) {
+    std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  }
+
+  std::vector<int> keys;
+  const int kFlushFlag = -1;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+    if (i == FLAGS_total_keys / 2) {
+      // Issuing a flush in the middle.
+      keys.push_back(kFlushFlag);
+    }
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+  int num_mutex_waited = 0;
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      FlushOptions fo;
+      db->Flush(fo);
+      continue;
+    }
+
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    db->Put(write_options, key, value);
+    if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+      ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+    }
+    hist_write_pre_post.Add(
+        get_perf_context()->write_pre_and_post_process_time);
+    hist_write_wal_time.Add(get_perf_context()->write_wal_time);
+    hist_write_memtable_time.Add(get_perf_context()->write_memtable_time);
+    hist_write_delay_time.Add(get_perf_context()->write_delay_time);
+    hist_write_thread_wait_nanos.Add(
+        get_perf_context()->write_thread_wait_nanos);
+    hist_write_scheduling_time.Add(
+        get_perf_context()->write_scheduling_flushes_compactions_time);
+    hist_put.Add(get_perf_context()->user_key_comparison_count);
+    total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos;
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      continue;
+    }
+    std::string key = "k" + ToString(i);
+    std::string expected_value = "v" + ToString(i);
+    std::string value;
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Get(read_options, key, &value));
+    ASSERT_EQ(expected_value, value);
+    hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+    hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+    get_perf_context()->Reset();
+    db->MultiGet(read_options, multiget_keys, &values);
+    hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(
+        get_perf_context()->get_from_memtable_count);
+    hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_mget.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Put user key comparison: \n" << hist_put.ToString()
+              << "Get user key comparison: \n" << hist_get.ToString()
+              << "MultiGet user key comparison: \n" << hist_mget.ToString();
+    std::cout << "Put(): Pre and Post Process Time: \n"
+              << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+              << hist_write_wal_time.ToString() << "\n"
+              << " Writing Mem Table time: \n"
+              << hist_write_memtable_time.ToString() << "\n"
+              << " Write Delay: \n" << hist_write_delay_time.ToString() << "\n"
+              << " Waiting for Batch time: \n"
+              << hist_write_thread_wait_nanos.ToString() << "\n"
+              << " Scheduling Flushes and Compactions Time: \n"
+              << hist_write_scheduling_time.ToString() << "\n"
+              << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n";
+
+    std::cout << "Get(): Time to get snapshot: \n"
+              << hist_get_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_get_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_get_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_get_post_process.ToString() << "\n";
+
+    std::cout << "MultiGet(): Time to get snapshot: \n"
+              << hist_mget_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_mget_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_mget_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_mget_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_mget_post_process.ToString() << "\n";
+  }
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_snapshot.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_get_post_process.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+
+    EXPECT_GT(hist_write_pre_post.Average(), 0);
+    EXPECT_GT(hist_write_wal_time.Average(), 0);
+    EXPECT_GT(hist_write_memtable_time.Average(), 0);
+    EXPECT_EQ(hist_write_delay_time.Average(), 0);
+    EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0);
+    EXPECT_GT(hist_write_scheduling_time.Average(), 0);
+
+#ifndef NDEBUG
+    ASSERT_GT(total_db_mutex_nanos, 2000U);
+#endif
+  }
+
+  db.reset();
+  db = OpenDb(true);
+
+  hist_get.Clear();
+  hist_get_snapshot.Clear();
+  hist_get_memtable.Clear();
+  hist_get_files.Clear();
+  hist_get_post_process.Clear();
+  hist_num_memtable_checked.Clear();
+
+  hist_mget.Clear();
+  hist_mget_snapshot.Clear();
+  hist_mget_memtable.Clear();
+  hist_mget_files.Clear();
+  hist_mget_post_process.Clear();
+  hist_mget_num_memtable_checked.Clear();
+
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      continue;
+    }
+    std::string key = "k" + ToString(i);
+    std::string expected_value = "v" + ToString(i);
+    std::string value;
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Get(read_options, key, &value));
+    ASSERT_EQ(expected_value, value);
+    hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+    hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+    get_perf_context()->Reset();
+    db->MultiGet(read_options, multiget_keys, &values);
+    hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(
+        get_perf_context()->get_from_memtable_count);
+    hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_mget.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "ReadOnly Get user key comparison: \n" << hist_get.ToString()
+              << "ReadOnly MultiGet user key comparison: \n"
+              << hist_mget.ToString();
+
+    std::cout << "ReadOnly Get(): Time to get snapshot: \n"
+              << hist_get_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_get_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_get_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_get_post_process.ToString() << "\n";
+
+    std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+              << hist_mget_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_mget_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_mget_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_mget_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_mget_post_process.ToString() << "\n";
+  }
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+    // In read-only mode Get(), no super version operation is needed
+    ASSERT_EQ(hist_get_post_process.Average(), 0);
+    ASSERT_GT(hist_get_snapshot.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(PerfContextTest, KeyComparisonCount) {
+  SetPerfLevel(kEnableCount);
+  ProfileQueries();
+
+  SetPerfLevel(kDisable);
+  ProfileQueries();
+
+  SetPerfLevel(kEnableTime);
+  ProfileQueries(true);
+}
+#endif // ROCKSDB_LITE
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, worst-case Seek Key comparison
+// is close to the total number of keys (linear), when there is only one
+// memtable. When there are two memtables, even the avg Seek Key comparison
+// starts to become linear to the input size.
+
+TEST_F(PerfContextTest, SeekKeyComparison) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  if (FLAGS_verbose) {
+    std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  }
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_wal_time;
+  HistogramImpl hist_time_diff;
+
+  SetPerfLevel(kEnableTime);
+  StopWatchNano timer(Env::Default());
+  for (const int i : keys) {
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    get_perf_context()->Reset();
+    timer.Start();
+    db->Put(write_options, key, value);
+    auto put_time = timer.ElapsedNanos();
+    hist_put_time.Add(put_time);
+    hist_wal_time.Add(get_perf_context()->write_wal_time);
+    hist_time_diff.Add(put_time - get_perf_context()->write_wal_time);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Put time:\n" << hist_put_time.ToString() << "WAL time:\n"
+              << hist_wal_time.ToString() << "time diff:\n"
+              << hist_time_diff.ToString();
+  }
+
+  HistogramImpl hist_seek;
+  HistogramImpl hist_next;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    get_perf_context()->Reset();
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->value().ToString(), value);
+    hist_seek.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  for (iter->SeekToFirst(); iter->Valid();) {
+    get_perf_context()->Reset();
+    iter->Next();
+    hist_next.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n"
+              << hist_next.ToString();
+  }
+}
+
+TEST_F(PerfContextTest, DBMutexLockCounter) {
+  int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+  for (PerfLevel perf_level_test :
+       {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) {
+    for (int c = 0; c < 2; ++c) {
+      InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]);
+      mutex.Lock();
+      ROCKSDB_NAMESPACE::port::Thread child_thread([&] {
+        SetPerfLevel(perf_level_test);
+        get_perf_context()->Reset();
+        ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+        mutex.Lock();
+        mutex.Unlock();
+        if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex ||
+            stats_code[c] != DB_MUTEX_WAIT_MICROS) {
+          ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+        } else {
+          // increment the counter only when it's a DB Mutex
+          ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0);
+        }
+      });
+      Env::Default()->SleepForMicroseconds(100);
+      mutex.Unlock();
+      child_thread.join();
+    }
+  }
+}
+
+TEST_F(PerfContextTest, FalseDBMutexWait) {
+  SetPerfLevel(kEnableTime);
+  int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+  for (int c = 0; c < 2; ++c) {
+    InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]);
+    InstrumentedCondVar lock(&mutex);
+    get_perf_context()->Reset();
+    mutex.Lock();
+    lock.TimedWait(100);
+    mutex.Unlock();
+    if (stats_code[c] == static_cast<int>(DB_MUTEX_WAIT_MICROS)) {
+      // increment the counter only when it's a DB Mutex
+      ASSERT_GT(get_perf_context()->db_condition_wait_nanos, 0);
+    } else {
+      ASSERT_EQ(get_perf_context()->db_condition_wait_nanos, 0);
+    }
+  }
+}
+
+TEST_F(PerfContextTest, ToString) {
+  get_perf_context()->Reset();
+  get_perf_context()->block_read_count = 12345;
+
+  std::string zero_included = get_perf_context()->ToString();
+  ASSERT_NE(std::string::npos, zero_included.find("= 0"));
+  ASSERT_NE(std::string::npos, zero_included.find("= 12345"));
+
+  std::string zero_excluded = get_perf_context()->ToString(true);
+  ASSERT_EQ(std::string::npos, zero_excluded.find("= 0"));
+  ASSERT_NE(std::string::npos, zero_excluded.find("= 12345"));
+}
+
+TEST_F(PerfContextTest, MergeOperatorTime) {
+  DestroyDB(kDbName, Options());
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Status s = DB::Open(options, kDbName, &db);
+  EXPECT_OK(s);
+
+  std::string val;
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2"));
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3"));
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4"));
+
+  SetPerfLevel(kEnableTime);
+  get_perf_context()->Reset();
+  ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+  }
+#endif
+  EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  get_perf_context()->Reset();
+  ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+  }
+#endif
+  EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  get_perf_context()->Reset();
+  ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+  }
+#endif
+  EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+  delete db;
+}
+
+TEST_F(PerfContextTest, CopyAndMove) {
+  // Assignment operator
+  {
+    get_perf_context()->Reset();
+    get_perf_context()->EnablePerLevelPerfContext();
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+    ASSERT_EQ(
+        1,
+        (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+    PerfContext perf_context_assign;
+    perf_context_assign = *get_perf_context();
+    ASSERT_EQ(
+        1,
+        (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+    get_perf_context()->ClearPerLevelPerfContext();
+    get_perf_context()->Reset();
+    ASSERT_EQ(
+        1,
+        (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+    perf_context_assign.ClearPerLevelPerfContext();
+    perf_context_assign.Reset();
+  }
+  // Copy constructor
+  {
+    get_perf_context()->Reset();
+    get_perf_context()->EnablePerLevelPerfContext();
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+    ASSERT_EQ(
+        1,
(*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_copy(*get_perf_context()); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + perf_context_copy.ClearPerLevelPerfContext(); + perf_context_copy.Reset(); + } + // Move constructor + { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_move = std::move(*get_perf_context()); + ASSERT_EQ( + 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful); + perf_context_move.ClearPerLevelPerfContext(); + perf_context_move.Reset(); + } +} + +TEST_F(PerfContextTest, PerfContextDisableEnable) { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0); + get_perf_context()->DisablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0); + get_perf_context()->DisablePerLevelPerfContext(); + PerfContext perf_context_copy(*get_perf_context()); + ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0] + .bloom_filter_full_positive); + // this was set when per level perf context is disabled, should not be copied + ASSERT_NE( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count); + perf_context_copy.ClearPerLevelPerfContext(); + perf_context_copy.Reset(); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); +} + +TEST_F(PerfContextTest, PerfContextByLevelGetSet) { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ( + 1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + ASSERT_EQ( + 2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0] + .bloom_filter_full_positive); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2] + .bloom_filter_full_true_positive); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0] + .block_cache_hit_count); + ASSERT_EQ(5, (*(get_perf_context()->level_to_perf_context))[2] + .block_cache_hit_count); + ASSERT_EQ(2, (*(get_perf_context()->level_to_perf_context))[3] + 
.block_cache_miss_count);
+  ASSERT_EQ(4, (*(get_perf_context()->level_to_perf_context))[1]
+                   .block_cache_miss_count);
+  std::string zero_excluded = get_perf_context()->ToString(true);
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("bloom_filter_full_positive = 1@level0"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("bloom_filter_full_true_positive = 1@level2"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3"));
+}
+
+TEST_F(PerfContextTest, CPUTimer) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+
+  std::string max_str = "0";
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string i_str = ToString(i);
+    std::string key = "k" + i_str;
+    std::string value = "v" + i_str;
+    max_str = max_str > i_str ? max_str : i_str;
+
+    db->Put(write_options, key, value);
+  }
+  std::string last_key = "k" + max_str;
+  std::string last_value = "v" + max_str;
+
+  {
+    // Get
+    get_perf_context()->Reset();
+    std::string value;
+    ASSERT_OK(db->Get(read_options, "k0", &value));
+    ASSERT_EQ(value, "v0");
+
+    if (FLAGS_verbose) {
+      std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos
+                << "ns\n";
+    }
+
+    // Iter
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    // Seek
+    get_perf_context()->Reset();
+    iter->Seek(last_key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(last_value, iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter Seek CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // SeekForPrev
+    get_perf_context()->Reset();
+    iter->SeekForPrev(last_key);
+    ASSERT_TRUE(iter->Valid());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter SeekForPrev CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // SeekToLast
+    get_perf_context()->Reset();
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(last_value, iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter SeekToLast CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // SeekToFirst
+    get_perf_context()->Reset();
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("v0", iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter SeekToFirst CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // Next
+    get_perf_context()->Reset();
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("v1", iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter Next CPU time nanos: "
+                << get_perf_context()->iter_next_cpu_nanos << "ns\n";
+    }
+
+    // Prev
+    get_perf_context()->Reset();
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("v0", iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter Prev CPU time nanos: "
+                << get_perf_context()->iter_prev_cpu_nanos << "ns\n";
+    }
+
+    // monotonically increasing
+    get_perf_context()->Reset();
+    auto count = get_perf_context()->iter_seek_cpu_nanos;
+    for (int i = 0; i < FLAGS_total_keys; ++i) {
+      iter->Seek("k" + ToString(i));
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("v" + ToString(i), iter->value().ToString());
+      auto next_count = get_perf_context()->iter_seek_cpu_nanos;
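+      // iter_seek_cpu_nanos accumulates across seeks on the same perf
+      // context, so each successful Seek must strictly increase it.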
+      ASSERT_GT(next_count, count);
+      count = next_count;
+    }
+
+    // iterator creation/destruction; multiple iterators
+    {
+      std::unique_ptr<Iterator> iter2(db->NewIterator(read_options));
+      ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+      iter2->Seek(last_key);
+      ASSERT_TRUE(iter2->Valid());
+      ASSERT_EQ(last_value, iter2->value().ToString());
+      ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count);
+      count = get_perf_context()->iter_seek_cpu_nanos;
+    }
+    ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+  }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  for (int i = 1; i < argc; i++) {
+    int n;
+    char junk;
+
+    if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    }
+
+    if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+      FLAGS_total_keys = n;
+    }
+
+    if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_random_key = n;
+    }
+
+    if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_use_set_based_memetable = n;
+    }
+
+    if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_verbose = n;
+    }
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << kDbName << "\n";
+  }
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/pinned_iterators_manager.h b/src/rocksdb/db/pinned_iterators_manager.h
new file mode 100644
index 000000000..5e8ad51dd
--- /dev/null
+++ b/src/rocksdb/db/pinned_iterators_manager.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
+// and it will be responsible for deleting pinned Iterators when they are
+// not needed anymore.
+class PinnedIteratorsManager : public Cleanable {
+ public:
+  PinnedIteratorsManager() : pinning_enabled(false) {}
+  ~PinnedIteratorsManager() {
+    if (pinning_enabled) {
+      ReleasePinnedData();
+    }
+  }
+
+  // Enable Iterators pinning
+  void StartPinning() {
+    assert(pinning_enabled == false);
+    pinning_enabled = true;
+  }
+
+  // Is pinning enabled ?
+  bool PinningEnabled() { return pinning_enabled; }
+
+  // Take ownership of iter and delete it when ReleasePinnedData() is called
+  void PinIterator(InternalIterator* iter, bool arena = false) {
+    if (arena) {
+      PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
+    } else {
+      PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
+    }
+  }
+
+  typedef void (*ReleaseFunction)(void* arg1);
+  void PinPtr(void* ptr, ReleaseFunction release_func) {
+    assert(pinning_enabled);
+    if (ptr == nullptr) {
+      return;
+    }
+    pinned_ptrs_.emplace_back(ptr, release_func);
+  }
+
+  // Release pinned Iterators
+  inline void ReleasePinnedData() {
+    assert(pinning_enabled == true);
+    pinning_enabled = false;
+
+    // Remove duplicate pointers
+    std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
+    auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
+
+    for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
+      void* ptr = i->first;
+      ReleaseFunction release_func = i->second;
+      release_func(ptr);
+    }
+    pinned_ptrs_.clear();
+    // Also do cleanups from the base Cleanable
+    Cleanable::Reset();
+  }
+
+ private:
+  static void ReleaseInternalIterator(void* ptr) {
+    delete reinterpret_cast<InternalIterator*>(ptr);
+  }
+
+  static void ReleaseArenaInternalIterator(void* ptr) {
+    reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
+  }
+
+  bool pinning_enabled;
+  std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
new file mode 100644
index 000000000..d80cc4f67
--- /dev/null
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -0,0 +1,1375 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/plain/plain_table_reader.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace ROCKSDB_NAMESPACE {
+class PlainTableKeyDecoderTest : public testing::Test {};
+
+TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) {
+  std::string tmp;
+  Random rnd(301);
+  const uint32_t kLength = 2222;
+  Slice contents = test::RandomString(&rnd, kLength, &tmp);
+  test::StringSource* string_source =
+      new test::StringSource(contents, 0, false);
+
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      test::GetRandomAccessFileReader(string_source));
+  std::unique_ptr<PlainTableReaderFileInfo> file_info(
+      new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(),
+                                   kLength));
+
+  {
+    PlainTableFileReader reader(file_info.get());
+
+    const uint32_t kReadSize = 77;
+    for (uint32_t pos = 0; pos < kLength; pos += kReadSize) {
+      uint32_t read_size = std::min(kLength - pos, kReadSize);
+      Slice out;
+      ASSERT_TRUE(reader.Read(pos, read_size, &out));
+      ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size)));
+    }
+
+    ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2);
+  }
+
+  std::vector<std::vector<std::pair<uint32_t, size_t>>> reads = {
+      {{600, 30}, {590, 30}, {600, 20}, {600, 40}},
+      {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}},
+      {{1000, 20}, {500, 20}, {1000, 50}},
+      {{1000, 20}, {500, 20}, {500, 20}},
+      {{1000, 20}, {500, 20}, {200, 20}, {500, 20}},
+      {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}},
+      {{600, 500}, {610, 20}, {100, 20}},
+      {{500, 100}, {490, 100}, {550, 50}},
+  };
+
+  std::vector<int> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2};
+
+  for (size_t i = 0; i < reads.size(); i++) {
+    string_source->set_total_reads(0);
+    PlainTableFileReader reader(file_info.get());
+    for (auto p : reads[i]) {
+      Slice out;
+      ASSERT_TRUE(reader.Read(p.first, p.second, &out));
+      ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second)));
+    }
+    ASSERT_EQ(num_file_reads[i], string_source->total_reads());
+  }
+}
+
+class PlainTableDBTest : public testing::Test,
+                         public testing::WithParamInterface<bool> {
+ protected:
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  bool mmap_mode_;
+  Options last_options_;
+
+ public:
+  PlainTableDBTest() : env_(Env::Default()) {}
+
+  ~PlainTableDBTest() override {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  void SetUp() override {
+    mmap_mode_ = GetParam();
+    dbname_ = test::PerThreadDBPath("plain_table_db_test");
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 0;
+    plain_table_options.bloom_bits_per_key = 2;
+    plain_table_options.hash_table_ratio = 0.8;
+    plain_table_options.index_sparseness = 3;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPrefix;
+    plain_table_options.full_scan_mode = false;
+    plain_table_options.store_index_in_file = false;
+
+    options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.allow_mmap_reads = mmap_mode_;
+    options.allow_concurrent_memtable_write = false;
+    options.unordered_write = false;
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  bool mmap_mode() const { return mmap_mode_; }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    // Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  Status ReopenForReadOnly(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    return DB::OpenForReadOnly(*options, dbname_, &db_);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(level), &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    size_t last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
"," : ""), f); + result += buf; + if (f > 0) { + last_non_zero_offset = result.size(); + } + } + result.resize(last_non_zero_offset); + return result; + } + + std::string IterStatus(Iterator* iter) { + std::string result; + if (iter->Valid()) { + result = iter->key().ToString() + "->" + iter->value().ToString(); + } else { + result = "(invalid)"; + } + return result; + } +}; + +TEST_P(PlainTableDBTest, Empty) { + ASSERT_TRUE(dbfull() != nullptr); + ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); +} + +extern const uint64_t kPlainTableMagicNumber; + +class TestPlainTableReader : public PlainTableReader { + public: + TestPlainTableReader(const EnvOptions& env_options, + const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, + int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness, + const TableProperties* table_properties, + std::unique_ptr&& file, + const ImmutableCFOptions& ioptions, + const SliceTransform* prefix_extractor, + bool* expect_bloom_not_match, bool store_index_in_file, + uint32_t column_family_id, + const std::string& column_family_name) + : PlainTableReader(ioptions, std::move(file), env_options, icomparator, + encoding_type, file_size, table_properties, + prefix_extractor), + expect_bloom_not_match_(expect_bloom_not_match) { + Status s = MmapDataIfNeeded(); + EXPECT_TRUE(s.ok()); + + s = PopulateIndex(const_cast(table_properties), + bloom_bits_per_key, hash_table_ratio, index_sparseness, + 2 * 1024 * 1024); + EXPECT_TRUE(s.ok()); + + TableProperties* props = const_cast(table_properties); + EXPECT_EQ(column_family_id, static_cast(props->column_family_id)); + EXPECT_EQ(column_family_name, props->column_family_name); + if (store_index_in_file) { + auto bloom_version_ptr = props->user_collected_properties.find( + PlainTablePropertyNames::kBloomVersion); + EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); + EXPECT_EQ(bloom_version_ptr->second, std::string("1")); + if (ioptions.bloom_locality > 0) { + auto num_blocks_ptr = props->user_collected_properties.find( + PlainTablePropertyNames::kNumBloomBlocks); + EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); + } + } + table_properties_.reset(props); + } + + ~TestPlainTableReader() override {} + + private: + bool MatchBloom(uint32_t hash) const override { + bool ret = PlainTableReader::MatchBloom(hash); + if (*expect_bloom_not_match_) { + EXPECT_TRUE(!ret); + } else { + EXPECT_TRUE(ret); + } + return ret; + } + bool* expect_bloom_not_match_; +}; + +extern const uint64_t kPlainTableMagicNumber; +class TestPlainTableFactory : public PlainTableFactory { + public: + explicit TestPlainTableFactory(bool* expect_bloom_not_match, + const PlainTableOptions& options, + uint32_t column_family_id, + std::string column_family_name) + : PlainTableFactory(options), + bloom_bits_per_key_(options.bloom_bits_per_key), + hash_table_ratio_(options.hash_table_ratio), + index_sparseness_(options.index_sparseness), + store_index_in_file_(options.store_index_in_file), + expect_bloom_not_match_(expect_bloom_not_match), + column_family_id_(column_family_id), + column_family_name_(std::move(column_family_name)) {} + + Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, + bool /*prefetch_index_and_filter_in_cache*/) const override { + TableProperties* props = nullptr; + auto s = + ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + 
+                            table_reader_options.ioptions, &props,
+                            true /* compression_type_missing */);
+    EXPECT_TRUE(s.ok());
+
+    if (store_index_in_file_) {
+      BlockHandle bloom_block_handle;
+      s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+                        table_reader_options.ioptions,
+                        BloomBlockBuilder::kBloomBlock, &bloom_block_handle,
+                        /* compression_type_missing */ true);
+      EXPECT_TRUE(s.ok());
+
+      BlockHandle index_block_handle;
+      s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+                        table_reader_options.ioptions,
+                        PlainTableIndexBuilder::kPlainTableIndexBlock,
+                        &index_block_handle,
+                        /* compression_type_missing */ true);
+      EXPECT_TRUE(s.ok());
+    }
+
+    auto& user_props = props->user_collected_properties;
+    auto encoding_type_prop =
+        user_props.find(PlainTablePropertyNames::kEncodingType);
+    assert(encoding_type_prop != user_props.end());
+    EncodingType encoding_type = static_cast<EncodingType>(
+        DecodeFixed32(encoding_type_prop->second.c_str()));
+
+    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+        table_reader_options.env_options,
+        table_reader_options.internal_comparator, encoding_type, file_size,
+        bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
+        std::move(file), table_reader_options.ioptions,
+        table_reader_options.prefix_extractor, expect_bloom_not_match_,
+        store_index_in_file_, column_family_id_, column_family_name_));
+
+    *table = std::move(new_reader);
+    return s;
+  }
+
+ private:
+  int bloom_bits_per_key_;
+  double hash_table_ratio_;
+  size_t index_sparseness_;
+  bool store_index_in_file_;
+  bool* expect_bloom_not_match_;
+  const uint32_t column_family_id_;
+  const std::string column_family_name_;
+};
+
+TEST_P(PlainTableDBTest, BadOptions1) {
+  // Build with a prefix extractor
+  ASSERT_OK(Put("1000000000000foo", "v1"));
+  dbfull()->TEST_FlushMemTable();
+
+  // Bad attempt to re-open without a prefix extractor
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset();
+  ASSERT_EQ(
+      "Invalid argument: Prefix extractor is missing when opening a PlainTable "
+      "built using a prefix extractor",
+      TryReopen(&options).ToString());
+
+  // Bad attempt to re-open with different prefix extractor
+  options.prefix_extractor.reset(NewFixedPrefixTransform(6));
+  ASSERT_EQ(
+      "Invalid argument: Prefix extractor given doesn't match the one used to "
+      "build PlainTable",
+      TryReopen(&options).ToString());
+
+  // Correct prefix extractor
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  Reopen(&options);
+  ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, BadOptions2) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset();
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+  // Build without a prefix extractor
+  // (apparently works even if hash_table_ratio > 0)
+  ASSERT_OK(Put("1000000000000foo", "v1"));
+  dbfull()->TEST_FlushMemTable();
+
+  // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor
+  Status s = TryReopen(&options);
+  ASSERT_EQ(
+      "Not implemented: PlainTable requires a prefix extractor enable prefix "
+      "hash mode.",
+      s.ToString());
+
+  // OK to open with hash_table_ratio == 0 and no prefix extractor
+  PlainTableOptions plain_table_options;
+  plain_table_options.hash_table_ratio = 0;
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  Reopen(&options);
+  ASSERT_EQ("v1", Get("1000000000000foo"));
+
+  // OK to open newly with a prefix_extractor and hash table; builds index
+  // in memory.
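+  // (CurrentOptions() supplies the fixed-8 prefix extractor again, so the
+  // hash index can be rebuilt in memory when the table is opened.)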
+  options = CurrentOptions();
+  Reopen(&options);
+  ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, Flush) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (EncodingType encoding_type : {kPlain, kPrefix}) {
+      for (int bloom = -1; bloom <= 117; bloom += 117) {
+        const int bloom_bits = std::max(bloom, 0);
+        const bool full_scan_mode = bloom < 0;
+        for (int total_order = 0; total_order <= 1; total_order++) {
+          for (int store_index_in_file = 0; store_index_in_file <= 1;
+               ++store_index_in_file) {
+            Options options = CurrentOptions();
+            options.create_if_missing = true;
+            // Set only one bucket to force bucket conflict.
+            // Test index interval for the same prefix to be 1, 2 and 4
+            if (total_order) {
+              options.prefix_extractor.reset();
+
+              PlainTableOptions plain_table_options;
+              plain_table_options.user_key_len = 0;
+              plain_table_options.bloom_bits_per_key = bloom_bits;
+              plain_table_options.hash_table_ratio = 0;
+              plain_table_options.index_sparseness = 2;
+              plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+              plain_table_options.encoding_type = encoding_type;
+              plain_table_options.full_scan_mode = full_scan_mode;
+              plain_table_options.store_index_in_file = store_index_in_file;
+
+              options.table_factory.reset(
+                  NewPlainTableFactory(plain_table_options));
+            } else {
+              PlainTableOptions plain_table_options;
+              plain_table_options.user_key_len = 0;
+              plain_table_options.bloom_bits_per_key = bloom_bits;
+              plain_table_options.hash_table_ratio = 0.75;
+              plain_table_options.index_sparseness = 16;
+              plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+              plain_table_options.encoding_type = encoding_type;
+              plain_table_options.full_scan_mode = full_scan_mode;
+              plain_table_options.store_index_in_file = store_index_in_file;
+
+              options.table_factory.reset(
+                  NewPlainTableFactory(plain_table_options));
+            }
+            DestroyAndReopen(&options);
+            uint64_t int_num;
+            ASSERT_TRUE(dbfull()->GetIntProperty(
+                "rocksdb.estimate-table-readers-mem", &int_num));
+            ASSERT_EQ(int_num, 0U);
+
+            ASSERT_OK(Put("1000000000000foo", "v1"));
+            ASSERT_OK(Put("0000000000000bar", "v2"));
+            ASSERT_OK(Put("1000000000000foo", "v3"));
+            dbfull()->TEST_FlushMemTable();
+
+            ASSERT_TRUE(dbfull()->GetIntProperty(
+                "rocksdb.estimate-table-readers-mem", &int_num));
+            ASSERT_GT(int_num, 0U);
+
+            TablePropertiesCollection ptc;
+            reinterpret_cast<DBImpl*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+            ASSERT_EQ(1U, ptc.size());
+            auto row = ptc.begin();
+            auto tp = row->second;
+
+            if (full_scan_mode) {
+              // Does not support Get/Seek
+              std::unique_ptr<Iterator> iter(
+                  dbfull()->NewIterator(ReadOptions()));
+              iter->SeekToFirst();
+              ASSERT_TRUE(iter->Valid());
+              ASSERT_EQ("0000000000000bar", iter->key().ToString());
+              ASSERT_EQ("v2", iter->value().ToString());
+              iter->Next();
+              ASSERT_TRUE(iter->Valid());
+              ASSERT_EQ("1000000000000foo", iter->key().ToString());
+              ASSERT_EQ("v3", iter->value().ToString());
+              iter->Next();
+              ASSERT_TRUE(!iter->Valid());
+              ASSERT_TRUE(iter->status().ok());
+            } else {
+              if (!store_index_in_file) {
+                ASSERT_EQ(total_order ?
"4" : "12", + (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } else { + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + } + } + } + } + } + } +} + +TEST_P(PlainTableDBTest, Flush2) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + for (int store_index_in_file = 0; store_index_in_file <= 1; + ++store_index_in_file) { + if (encoding_type == kPrefix && total_order) { + continue; + } + if (!bloom_bits && store_index_in_file) { + continue; + } + if (total_order && store_index_in_file) { + continue; + } + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + if (total_order) { + options.prefix_extractor = nullptr; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + } else { + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + } + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.store_index_in_file = store_index_in_file; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + ASSERT_OK(Put("1000000000000foo", "v2")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + // Key doesn't exist any more but prefix exists. + if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + } + expect_bloom_not_match = false; + } + } + } + } + } + } +} + +TEST_P(PlainTableDBTest, Immortal) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_open_files = -1; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 10; + plain_table_options.encoding_type = encoding_type; + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + int copied = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ(2, copied); + copied = 0; + + Close(); + ASSERT_OK(ReopenForReadOnly(&options)); + + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + if (mmap_mode()) { + ASSERT_EQ(0, copied); + } else { + ASSERT_EQ(2, copied); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(PlainTableDBTest, Iterator) { + for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; + huge_page_tlb_size += 2 * 1024 * 1024) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + if (encoding_type == kPrefix && total_order == 1) { + continue; + } + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + } else { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + ASSERT_EQ("bar_v", iter->value().ToString()); + + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } + + // Test Bloom Filter + if (bloom_bits > 0) { + if (!total_order) { + // Neither key nor value should exist. 
+ expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } else { + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } + } + + delete iter; + } + } + } + } +} + +namespace { +std::string NthKey(size_t n, char filler) { + std::string rv(16, filler); + rv[0] = n % 10; + rv[1] = (n / 10) % 10; + rv[2] = (n / 100) % 10; + rv[3] = (n / 1000) % 10; + return rv; +} +} // anonymous namespace + +TEST_P(PlainTableDBTest, BloomSchema) { + Options options = CurrentOptions(); + options.create_if_missing = true; + for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) { + options.bloom_locality = bloom_locality; + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 3; // high FP rate for test + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + + bool expect_bloom_not_match = false; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */, + kDefaultColumnFamilyName)); + DestroyAndReopen(&options); + + for (unsigned i = 0; i < 2345; ++i) { + ASSERT_OK(Put(NthKey(i, 'y'), "added")); + } + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("added", Get(NthKey(42, 'y'))); + + for (unsigned i = 0; i < 32; ++i) { + // Known pattern of Bloom filter false positives can detect schema change + // with high probability. Known FPs stuffed into bits: + uint32_t pattern; + if (!bloom_locality) { + pattern = 1785868347UL; + } else if (CACHE_LINE_SIZE == 64U) { + pattern = 2421694657UL; + } else if (CACHE_LINE_SIZE == 128U) { + pattern = 788710956UL; + } else { + ASSERT_EQ(CACHE_LINE_SIZE, 256U); + pattern = 163905UL; + } + bool expect_fp = pattern & (1UL << i); + // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp); + expect_bloom_not_match = !expect_fp; + ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n'))); + } + } +} + +namespace { +std::string MakeLongKey(size_t length, char c) { + return std::string(length, c); +} +} // namespace + +TEST_P(PlainTableDBTest, IteratorLargeKeys) { + Options options = CurrentOptions(); + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + options.create_if_missing = true; + options.prefix_extractor.reset(); + DestroyAndReopen(&options); + + std::string key_list[] = { + MakeLongKey(30, '0'), + MakeLongKey(16, '1'), + MakeLongKey(32, '2'), + MakeLongKey(60, '3'), + MakeLongKey(90, '4'), + MakeLongKey(50, '5'), + MakeLongKey(26, '6') + }; + + for (size_t i = 0; i < 7; i++) { + ASSERT_OK(Put(key_list[i], ToString(i))); + } + + dbfull()->TEST_FlushMemTable(); + + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek(key_list[0]); + + for (size_t i = 0; i < 7; i++) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key_list[i], iter->key().ToString()); + ASSERT_EQ(ToString(i), iter->value().ToString()); + iter->Next(); + } + + ASSERT_TRUE(!iter->Valid()); + + delete iter; +} + +namespace { +std::string MakeLongKeyWithPrefix(size_t length, char c) { + return "00000000" + std::string(length - 8, c); +} +} // namespace + 
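+// A minimal illustrative sketch of the prefix grouping exercised below
+// (assumes the fixed-8 prefix extractor set by CurrentOptions()): every key
+// produced by MakeLongKeyWithPrefix() shares the 8-byte prefix "00000000",
+// so the extractor maps all of them to the same prefix group.
+inline void PrefixGroupingSketch() {
+  std::unique_ptr<const SliceTransform> t(NewFixedPrefixTransform(8));
+  std::string k = MakeLongKeyWithPrefix(30, '0');  // 30-byte key, '0' filler
+  assert(t->Transform(k) == Slice("00000000"));    // same group for all keys
+}
+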
+TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+  Options options = CurrentOptions();
+
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 16;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0.8;
+  plain_table_options.index_sparseness = 3;
+  plain_table_options.huge_page_tlb_size = 0;
+  plain_table_options.encoding_type = kPrefix;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  std::string key_list[] = {
+      MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'),
+      MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'),
+      MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'),
+      MakeLongKeyWithPrefix(26, '6')};
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_OK(Put(key_list[i], ToString(i)));
+  }
+
+  dbfull()->TEST_FlushMemTable();
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek(key_list[0]);
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(key_list[i], iter->key().ToString());
+    ASSERT_EQ(ToString(i), iter->value().ToString());
+    iter->Next();
+  }
+
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+  // Test index interval for the same prefix to be 1, 2 and 4
+  test::SimpleSuffixReverseComparator comp;
+  options.comparator = &comp;
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000foo002", "v_2"));
+  ASSERT_OK(Put("0000000000000bar", "random"));
+  ASSERT_OK(Put("1000000000foo001", "v1"));
+  ASSERT_OK(Put("3000000000000bar", "bar_v"));
+  ASSERT_OK(Put("1000000000foo003", "v__3"));
+  ASSERT_OK(Put("1000000000foo004", "v__4"));
+  ASSERT_OK(Put("1000000000foo005", "v__5"));
+  ASSERT_OK(Put("1000000000foo007", "v__7"));
+  ASSERT_OK(Put("1000000000foo008", "v__8"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v1", Get("1000000000foo001"));
+  ASSERT_EQ("v__3", Get("1000000000foo003"));
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek("1000000000foo009");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo007", iter->key().ToString());
+  ASSERT_EQ("v__7", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo004", iter->key().ToString());
+  ASSERT_EQ("v__4", iter->value().ToString());
+
+  iter->Seek("3000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+  ASSERT_EQ("bar_v", iter->value().ToString());
+
+  iter->Seek("1000000000foo005");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo006");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo008");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Seek("1000000000foo000");
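+  // SimpleSuffixReverseComparator orders suffixes in reverse, so "...foo000"
+  // sorts after every stored "...fooNNN" key in this prefix group and the
+  // seek lands on the next prefix, "3000000000000bar".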
ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflict) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = 16;
+      plain_table_options.bloom_bits_per_key = 0;
+      plain_table_options.hash_table_ratio = 0;
+      plain_table_options.index_sparseness = 2 ^ i;  // "^" is XOR: 3, 0, 1
+      plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+      options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
+
+      dbfull()->TEST_FlushMemTable();
+
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
+
+      iter->Seek("5000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("2000000000000fo8");
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.comparator->Compare(iter->key(), "20000001") > 0);
+
+      iter->Seek("5000000000000fo8");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      delete iter;
+    }
+  }
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      test::SimpleSuffixReverseComparator comp;
+      options.comparator = &comp;
+      // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4 + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2 ^ i; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + DestroyAndReopen(&options); + ASSERT_OK(Put("5000000000000fo0", "v1")); + ASSERT_OK(Put("5000000000000fo1", "v2")); + ASSERT_OK(Put("5000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo0", "v3")); + ASSERT_OK(Put("2000000000000fo1", "v4")); + ASSERT_OK(Put("2000000000000fo2", "v")); + ASSERT_OK(Put("2000000000000fo3", "v")); + + dbfull()->TEST_FlushMemTable(); + + ASSERT_EQ("v1", Get("5000000000000fo0")); + ASSERT_EQ("v2", Get("5000000000000fo1")); + ASSERT_EQ("v3", Get("2000000000000fo0")); + ASSERT_EQ("v4", Get("2000000000000fo1")); + + ASSERT_EQ("NOT_FOUND", Get("5000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000bar")); + ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8")); + ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8")); + + ReadOptions ro; + Iterator* iter = dbfull()->NewIterator(ro); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo0", iter->key().ToString()); + + iter->Seek("5000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo0", iter->key().ToString()); + + iter->Seek("2000000000000fo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo1", iter->key().ToString()); + + iter->Seek("2000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("2000000000000fo3", iter->key().ToString()); + + iter->Seek("5000000000000var"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("5000000000000fo2", iter->key().ToString()); + + std::string seek_key = "2000000000000bar"; + iter->Seek(seek_key); + ASSERT_TRUE(!iter->Valid() || + options.prefix_extractor->Transform(iter->key()) != + options.prefix_extractor->Transform(seek_key)); + + iter->Seek("1000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("3000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("8000000000000fo2"); + ASSERT_TRUE(!iter->Valid()); + + delete iter; + } + } +} + +TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+  // Test index interval for the same prefix to be 1, 2 and 4
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 16;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0;
+  plain_table_options.index_sparseness = 5;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  DestroyAndReopen(&options);
+  ASSERT_OK(Put("5000000000000fo0", "v1"));
+  ASSERT_OK(Put("5000000000000fo1", "v2"));
+  ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+  dbfull()->TEST_FlushMemTable();
+
+  ASSERT_EQ("v1", Get("5000000000000fo0"));
+  ASSERT_EQ("v2", Get("5000000000000fo1"));
+  ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+  ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+  ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+  iter->Seek("5000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+  iter->Seek("5000000000000fo8");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("1000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("8000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+TEST_P(PlainTableDBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 120 << 10;  // 120KB
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (10 values, each 12K)
+    for (int i = 0; i < 10; i++) {
+      values.push_back(RandomString(&rnd, 12000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    ASSERT_OK(Put(Key(999), ""));
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Put(Key(999), ""));
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+TEST_P(PlainTableDBTest, AdaptiveTable) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+
+  options.table_factory.reset(NewPlainTableFactory());
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000000foo", "v1"));
+  ASSERT_OK(Put("0000000000000bar", "v2"));
+  ASSERT_OK(Put("1000000000000foo", "v3"));
+  dbfull()->TEST_FlushMemTable();
+
+  options.create_if_missing = false;
+  std::shared_ptr<TableFactory> block_based_factory(
+      NewBlockBasedTableFactory());
+  std::shared_ptr<TableFactory> plain_table_factory(
+      NewPlainTableFactory());
+  std::shared_ptr<TableFactory> dummy_factory;
+  options.table_factory.reset(NewAdaptiveTableFactory(
+      block_based_factory, block_based_factory, plain_table_factory));
+  Reopen(&options);
+  ASSERT_EQ("v3", Get("1000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+
+  ASSERT_OK(Put("2000000000000foo", "v4"));
+  ASSERT_OK(Put("3000000000000bar", "v5"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v4", Get("2000000000000foo"));
+  ASSERT_EQ("v5", Get("3000000000000bar"));
+
+  Reopen(&options);
+  ASSERT_EQ("v3", Get("1000000000000foo"));
Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + ASSERT_EQ("v4", Get("2000000000000foo")); + ASSERT_EQ("v5", Get("3000000000000bar")); + + options.paranoid_checks = false; + options.table_factory.reset(NewBlockBasedTableFactory()); + Reopen(&options); + ASSERT_NE("v3", Get("1000000000000foo")); + + options.paranoid_checks = false; + options.table_factory.reset(NewPlainTableFactory()); + Reopen(&options); + ASSERT_NE("v5", Get("3000000000000bar")); +} + +INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/pre_release_callback.h b/src/rocksdb/db/pre_release_callback.h new file mode 100644 index 000000000..b74be9537 --- /dev/null +++ b/src/rocksdb/db/pre_release_callback.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; + +class PreReleaseCallback { + public: + virtual ~PreReleaseCallback() {} + + // Will be called while on the write thread after the write to the WAL and + // before the write to memtable. This is useful if any operation needs to be + // done before the write gets visible to the readers, or if we want to reduce + // the overhead of locking by updating something sequentially while we are on + // the write thread. If the callback fails, this function returns a non-OK + // status, the sequence number will not be released, and same status will be + // propagated to all the writers in the write group. + // seq is the sequence number that is used for this write and will be + // released. + // is_mem_disabled is currently used for debugging purposes to assert that + // the callback is done from the right write queue. + // If non-zero, log_number indicates the WAL log to which we wrote. + // index >= 0 specifies the order of callback in the same write thread. + // total > index specifies the total number of callbacks in the same write + // thread. Together with index, could be used to reduce the redundant + // operations among the callbacks. + virtual Status Callback(SequenceNumber seq, bool is_mem_disabled, + uint64_t log_number, size_t index, size_t total) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc new file mode 100644 index 000000000..c61ec2a1e --- /dev/null +++ b/src/rocksdb/db/prefix_test.cc @@ -0,0 +1,895 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "monitoring/histogram.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "test_util/testharness.h" +#include "util/coding.h" +#include "util/gflags_compat.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_bool(trigger_deadlock, false, + "issue delete in range scan to trigger PrefixHashMap deadlock"); +DEFINE_int32(bucket_count, 100000, "number of buckets"); +DEFINE_uint64(num_locks, 10001, "number of locks"); +DEFINE_bool(random_prefix, false, "randomize prefix"); +DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); +DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); +DEFINE_int64(write_buffer_size, 33554432, ""); +DEFINE_int32(max_write_buffer_number, 2, ""); +DEFINE_int32(min_write_buffer_number_to_merge, 1, ""); +DEFINE_int32(skiplist_height, 4, ""); +DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, ""); +DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, ""); +DEFINE_int32(value_size, 40, ""); +DEFINE_bool(enable_print, false, "Print options generated to console."); + +// Path to the database on file system +const std::string kDbName = + ROCKSDB_NAMESPACE::test::PerThreadDBPath("prefix_test"); + +namespace ROCKSDB_NAMESPACE { + +struct TestKey { + uint64_t prefix; + uint64_t sorted; + + TestKey(uint64_t _prefix, uint64_t _sorted) + : prefix(_prefix), sorted(_sorted) {} +}; + +// return a slice backed by test_key +inline Slice TestKeyToSlice(std::string &s, const TestKey& test_key) { + s.clear(); + PutFixed64(&s, test_key.prefix); + PutFixed64(&s, test_key.sorted); + return Slice(s.c_str(), s.size()); +} + +inline const TestKey SliceToTestKey(const Slice& slice) { + return TestKey(DecodeFixed64(slice.data()), + DecodeFixed64(slice.data() + 8)); +} + +class TestKeyComparator : public Comparator { + public: + + // Compare needs to be aware of the possibility of a and/or b is + // prefix only + int Compare(const Slice& a, const Slice& b) const override { + const TestKey kkey_a = SliceToTestKey(a); + const TestKey kkey_b = SliceToTestKey(b); + const TestKey *key_a = &kkey_a; + const TestKey *key_b = &kkey_b; + if (key_a->prefix != key_b->prefix) { + if (key_a->prefix < key_b->prefix) return -1; + if (key_a->prefix > key_b->prefix) return 1; + } else { + EXPECT_TRUE(key_a->prefix == key_b->prefix); + // note, both a and b could be prefix only + if (a.size() != b.size()) { + // one of them is prefix + EXPECT_TRUE( + (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) || + (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey))); + if (a.size() < b.size()) return -1; + if (a.size() > b.size()) return 1; + } else { + // both a and b are prefix + if (a.size() == sizeof(uint64_t)) { + return 0; + } + + // both a and b are whole key + EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey)); + if (key_a->sorted < key_b->sorted) return -1; + if (key_a->sorted > key_b->sorted) return 1; + if (key_a->sorted == key_b->sorted) return 0; + } + } + return 0; + } + + bool operator()(const TestKey& a, const TestKey& b) const { + std::string sa, sb; + return Compare(TestKeyToSlice(sa, a), 
TestKeyToSlice(sb, b)) < 0; + } + + const char* Name() const override { return "TestKeyComparator"; } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +namespace { +void PutKey(DB* db, WriteOptions write_options, uint64_t prefix, + uint64_t suffix, const Slice& value) { + TestKey test_key(prefix, suffix); + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Put(write_options, key, value)); +} + +void PutKey(DB* db, WriteOptions write_options, const TestKey& test_key, + const Slice& value) { + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Put(write_options, key, value)); +} + +void MergeKey(DB* db, WriteOptions write_options, const TestKey& test_key, + const Slice& value) { + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Merge(write_options, key, value)); +} + +void DeleteKey(DB* db, WriteOptions write_options, const TestKey& test_key) { + std::string s; + Slice key = TestKeyToSlice(s, test_key); + ASSERT_OK(db->Delete(write_options, key)); +} + +void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) { + TestKey test_key(prefix, suffix); + std::string s; + Slice key = TestKeyToSlice(s, test_key); + iter->Seek(key); +} + +const std::string kNotFoundResult = "NOT_FOUND"; + +std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix, + uint64_t suffix) { + TestKey test_key(prefix, suffix); + std::string s2; + Slice key = TestKeyToSlice(s2, test_key); + + std::string result; + Status s = db->Get(read_options, key, &result); + if (s.IsNotFound()) { + result = kNotFoundResult; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; +} + +class SamePrefixTransform : public SliceTransform { + private: + const Slice prefix_; + std::string name_; + + public: + explicit SamePrefixTransform(const Slice& prefix) + : prefix_(prefix), name_("rocksdb.SamePrefix." 
+ prefix.ToString()) {} + + const char* Name() const override { return name_.c_str(); } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return prefix_; + } + + bool InDomain(const Slice& src) const override { + if (src.size() >= prefix_.size()) { + return Slice(src.data(), prefix_.size()) == prefix_; + } + return false; + } + + bool InRange(const Slice& dst) const override { return dst == prefix_; } + + bool FullLengthEnabled(size_t* /*len*/) const override { return false; } +}; + +} // namespace + +class PrefixTest : public testing::Test { + public: + std::shared_ptr OpenDb() { + DB* db; + + options.create_if_missing = true; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = + FLAGS_min_write_buffer_number_to_merge; + + options.memtable_prefix_bloom_size_ratio = + FLAGS_memtable_prefix_bloom_size_ratio; + options.memtable_huge_page_size = FLAGS_memtable_huge_page_size; + + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.allow_concurrent_memtable_write = false; + + Status s = DB::Open(options, kDbName, &db); + EXPECT_OK(s); + return std::shared_ptr(db); + } + + void FirstOption() { + option_config_ = kBegin; + } + + bool NextOptions(int bucket_count) { + // skip some options + option_config_++; + if (option_config_ < kEnd) { + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + switch(option_config_) { + case kHashSkipList: + options.memtable_factory.reset( + NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height)); + return true; + case kHashLinkList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count)); + return true; + case kHashLinkListHugePageTlb: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024)); + return true; + case kHashLinkListTriggerSkipList: + options.memtable_factory.reset( + NewHashLinkListRepFactory(bucket_count, 0, 3)); + return true; + default: + return false; + } + } + return false; + } + + PrefixTest() : option_config_(kBegin) { + options.comparator = new TestKeyComparator(); + } + ~PrefixTest() override { delete options.comparator; } + + protected: + enum OptionConfig { + kBegin, + kHashSkipList, + kHashLinkList, + kHashLinkListHugePageTlb, + kHashLinkListTriggerSkipList, + kEnd + }; + int option_config_; + Options options; +}; + +TEST(SamePrefixTest, InDomainTest) { + DB* db; + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(new SamePrefixTransform("HHKB")); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + WriteOptions write_options; + ReadOptions read_options; + { + ASSERT_OK(DestroyDB(kDbName, Options())); + ASSERT_OK(DB::Open(options, kDbName, &db)); + ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006")); + ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011")); + ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk")); + db->Flush(FlushOptions()); + std::string result; + auto db_iter = db->NewIterator(ReadOptions()); + + db_iter->Seek("Realforce 87u"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + 
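// "Realforce 87u" does not start with "HHKB", so it is outside the
+    // extractor's domain (InDomain() above returns false for it). Keys
+    // outside the domain are not checked against the prefix bloom filter,
+    // which is why this seek is still expected to find the key even with
+    // whole_key_filtering disabled.
+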
ASSERT_EQ(db_iter->key(), "Realforce 87u"); + ASSERT_EQ(db_iter->value(), "idk"); + + delete db_iter; + delete db; + ASSERT_OK(DestroyDB(kDbName, Options())); + } + + { + ASSERT_OK(DB::Open(options, kDbName, &db)); + ASSERT_OK(db->Put(write_options, "pikachu", "1")); + ASSERT_OK(db->Put(write_options, "Meowth", "1")); + ASSERT_OK(db->Put(write_options, "Mewtwo", "idk")); + db->Flush(FlushOptions()); + std::string result; + auto db_iter = db->NewIterator(ReadOptions()); + + db_iter->Seek("Mewtwo"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + delete db_iter; + delete db; + ASSERT_OK(DestroyDB(kDbName, Options())); + } +} + +TEST_F(PrefixTest, TestResult) { + for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { + FirstOption(); + while (NextOptions(num_buckets)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << " number of buckets: " << num_buckets + << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + // 1. Insert one row. + Slice v16("v16"); + PutKey(db.get(), write_options, 1, 6, v16); + std::unique_ptr iter(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6)); + ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6)); + + // 2. Insert an entry for the same prefix as the last entry in the bucket. + Slice v17("v17"); + PutKey(db.get(), write_options, 1, 7, v17); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 6); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(!iter->Valid()); + + // 3. Insert an entry for the same prefix as the head of the bucket. + Slice v15("v15"); + PutKey(db.get(), write_options, 1, 5, v15); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7)); + + // 4. 
Insert an entry with a larger prefix + Slice v22("v22"); + PutKey(db.get(), write_options, 2, 2, v22); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 2, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 5. Insert an entry with a smaller prefix + Slice v02("v02"); + PutKey(db.get(), write_options, 0, 2, v02); + iter.reset(db->NewIterator(read_options)); + + SeekIterator(iter.get(), 0, 2); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + SeekIterator(iter.get(), 1, 5); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + // 6. Insert to the beginning and the end of the first prefix + Slice v13("v13"); + Slice v18("v18"); + PutKey(db.get(), write_options, 1, 3, v13); + PutKey(db.get(), write_options, 1, 8, v18); + iter.reset(db->NewIterator(read_options)); + SeekIterator(iter.get(), 1, 7); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + + SeekIterator(iter.get(), 1, 3); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v13 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v15 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v16 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v17 == iter->value()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v18 == iter->value()); + + SeekIterator(iter.get(), 0, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v02 == iter->value()); + + SeekIterator(iter.get(), 2, 0); + ASSERT_TRUE(iter->Valid()); + ASSERT_TRUE(v22 == iter->value()); + + ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2)); + ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2)); + ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3)); + ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5)); + ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); + ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7)); + ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8)); + } + } +} + +// Show results in prefix +TEST_F(PrefixTest, PrefixValid) { + for (int num_buckets = 1; num_buckets <= 2; num_buckets++) { + FirstOption(); + while (NextOptions(num_buckets)) { + std::cout << "*** Mem table: " << options.memtable_factory->Name() + << " number of buckets: " << num_buckets << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + + // Insert keys with common prefix and one key with different + Slice v16("v16"); + Slice v17("v17"); + Slice v18("v18"); + Slice v19("v19"); + PutKey(db.get(), write_options, 12345, 6, v16); + PutKey(db.get(), write_options, 12345, 7, v17); + PutKey(db.get(), write_options, 12345, 8, v18); + PutKey(db.get(), write_options, 12345, 9, v19); + PutKey(db.get(), write_options, 12346, 8, v16); + db->Flush(FlushOptions()); + TestKey test_key(12346, 8); + 
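// Delete the only key with the different prefix (12346) so that, after
+      // the flush below, prefix 12345 is the only one left visible.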
+      std::string s;
+      db->Delete(write_options, TestKeyToSlice(s, test_key));
+      db->Flush(FlushOptions());
+      read_options.prefix_same_as_start = true;
+      std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 12345, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v18 == iter->value());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v19 == iter->value());
+      iter->Next();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8));
+
+      // Verify seeking past the prefix won't return a result.
+      SeekIterator(iter.get(), 12345, 10);
+      ASSERT_TRUE(!iter->Valid());
+    }
+  }
+}
+
+TEST_F(PrefixTest, DynamicPrefixIterator) {
+  while (NextOptions(FLAGS_bucket_count)) {
+    std::cout << "*** Mem table: " << options.memtable_factory->Name()
+              << std::endl;
+    DestroyDB(kDbName, Options());
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+
+    std::vector<uint64_t> prefixes;
+    for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+      prefixes.push_back(i);
+    }
+
+    if (FLAGS_random_prefix) {
+      std::random_shuffle(prefixes.begin(), prefixes.end());
+    }
+
+    HistogramImpl hist_put_time;
+    HistogramImpl hist_put_comparison;
+
+    // Insert x random prefixes, each with y continuous elements.
+    for (auto prefix : prefixes) {
+      for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+        TestKey test_key(prefix, sorted);
+
+        std::string s;
+        Slice key = TestKeyToSlice(s, test_key);
+        std::string value(FLAGS_value_size, 0);
+
+        get_perf_context()->Reset();
+        StopWatchNano timer(Env::Default(), true);
+        ASSERT_OK(db->Put(write_options, key, value));
+        hist_put_time.Add(timer.ElapsedNanos());
+        hist_put_comparison.Add(get_perf_context()->user_key_comparison_count);
+      }
+    }
+
+    std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+              << "Put time: \n" << hist_put_time.ToString();
+
+    // Test seeking to existing keys.
+    HistogramImpl hist_seek_time;
+    HistogramImpl hist_seek_comparison;
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    for (auto prefix : prefixes) {
+      TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+      std::string s;
+      Slice key = TestKeyToSlice(s, test_key);
+      std::string value = "v" + ToString(0);
+
+      get_perf_context()->Reset();
+      StopWatchNano timer(Env::Default(), true);
+      auto key_prefix = options.prefix_extractor->Transform(key);
+      uint64_t total_keys = 0;
+      for (iter->Seek(key);
+           iter->Valid() && iter->key().starts_with(key_prefix);
+           iter->Next()) {
+        if (FLAGS_trigger_deadlock) {
+          std::cout << "Behold the deadlock!\n";
+          db->Delete(write_options, iter->key());
+        }
+        total_keys++;
+      }
+      hist_seek_time.Add(timer.ElapsedNanos());
+      hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+      ASSERT_EQ(total_keys,
+                FLAGS_items_per_prefix - FLAGS_items_per_prefix / 2);
+    }
+
+    std::cout << "Seek key comparison: \n"
+              << hist_seek_comparison.ToString()
+              << "Seek time: \n"
+              << hist_seek_time.ToString();
+
+    // Test seeking to non-existing keys.
+    HistogramImpl hist_no_seek_time;
+    HistogramImpl hist_no_seek_comparison;
+
+    for (auto prefix = FLAGS_total_prefixes;
+         prefix < FLAGS_total_prefixes + 10000;
+         prefix++) {
+      TestKey test_key(prefix, 0);
+      std::string s;
+      Slice key = TestKeyToSlice(s, test_key);
+
+      get_perf_context()->Reset();
+      StopWatchNano timer(Env::Default(), true);
+      iter->Seek(key);
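+      // The prefixes in this loop lie beyond every inserted key, so each
+      // seek is expected to leave the iterator invalid; with the memtable
+      // prefix bloom enabled (memtable_prefix_bloom_size_ratio > 0), most of
+      // these lookups should be rejected before any user-key comparison.
+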
hist_no_seek_time.Add(timer.ElapsedNanos()); + hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count); + ASSERT_TRUE(!iter->Valid()); + } + + std::cout << "non-existing Seek key comparison: \n" + << hist_no_seek_comparison.ToString() + << "non-existing Seek time: \n" + << hist_no_seek_time.ToString(); + } +} + +TEST_F(PrefixTest, PrefixSeekModePrev) { + // Only for SkipListFactory + options.memtable_factory.reset(new SkipListFactory); + options.merge_operator = MergeOperators::CreatePutOperator(); + options.write_buffer_size = 1024 * 1024; + Random rnd(1); + for (size_t m = 1; m < 100; m++) { + std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: " + << options.memtable_factory->Name() << std::endl; + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + std::map entry_maps[3], whole_map; + for (uint64_t i = 0; i < 10; i++) { + int div = i % 3 + 1; + for (uint64_t j = 0; j < 10; j++) { + whole_map[TestKey(i, j)] = entry_maps[rnd.Uniform(div)][TestKey(i, j)] = + 'v' + std::to_string(i) + std::to_string(j); + } + } + + std::map type_map; + for (size_t i = 0; i < 3; i++) { + for (auto& kv : entry_maps[i]) { + if (rnd.OneIn(3)) { + PutKey(db.get(), write_options, kv.first, kv.second); + type_map[kv.first] = "value"; + } else { + MergeKey(db.get(), write_options, kv.first, kv.second); + type_map[kv.first] = "merge"; + } + } + if (i < 2) { + db->Flush(FlushOptions()); + } + } + + for (size_t i = 0; i < 2; i++) { + for (auto& kv : entry_maps[i]) { + if (rnd.OneIn(10)) { + whole_map.erase(kv.first); + DeleteKey(db.get(), write_options, kv.first); + entry_maps[2][kv.first] = "delete"; + } + } + } + + if (FLAGS_enable_print) { + for (size_t i = 0; i < 3; i++) { + for (auto& kv : entry_maps[i]) { + std::cout << "[" << i << "]" << kv.first.prefix << kv.first.sorted + << " " << kv.second + " " + type_map[kv.first] << std::endl; + } + } + } + + std::unique_ptr iter(db->NewIterator(read_options)); + for (uint64_t prefix = 0; prefix < 10; prefix++) { + uint64_t start_suffix = rnd.Uniform(9); + SeekIterator(iter.get(), prefix, start_suffix); + auto it = whole_map.find(TestKey(prefix, start_suffix)); + if (it == whole_map.end()) { + continue; + } + ASSERT_NE(it, whole_map.end()); + ASSERT_TRUE(iter->Valid()); + if (FLAGS_enable_print) { + std::cout << "round " << prefix + << " iter: " << SliceToTestKey(iter->key()).prefix + << SliceToTestKey(iter->key()).sorted + << " | map: " << it->first.prefix << it->first.sorted << " | " + << iter->value().ToString() << " " << it->second << std::endl; + } + ASSERT_EQ(iter->value(), it->second); + uint64_t stored_prefix = prefix; + for (size_t k = 0; k < 9; k++) { + if (rnd.OneIn(2) || it == whole_map.begin()) { + iter->Next(); + ++it; + if (FLAGS_enable_print) { + std::cout << "Next >> "; + } + } else { + iter->Prev(); + it--; + if (FLAGS_enable_print) { + std::cout << "Prev >> "; + } + } + if (!iter->Valid() || + SliceToTestKey(iter->key()).prefix != stored_prefix) { + break; + } + stored_prefix = SliceToTestKey(iter->key()).prefix; + ASSERT_TRUE(iter->Valid()); + ASSERT_NE(it, whole_map.end()); + ASSERT_EQ(iter->value(), it->second); + if (FLAGS_enable_print) { + std::cout << "iter: " << SliceToTestKey(iter->key()).prefix + << SliceToTestKey(iter->key()).sorted + << " | map: " << it->first.prefix << it->first.sorted + << " | " << iter->value().ToString() << " " << it->second + << std::endl; + } + } + } + } +} + +TEST_F(PrefixTest, PrefixSeekModePrev2) { + // Only for SkipListFactory 
+  // Test the following case:
+  //        iter1                iter2
+  // | prefix | suffix |  | prefix | suffix |
+  // |   1    |   1    |  |   1    |   2    |
+  // |   1    |   3    |  |   1    |   4    |
+  // |   2    |   1    |  |   3    |   3    |
+  // |   2    |   2    |  |   3    |   4    |
+  // After Seek(1, 5), iter1 is at (2, 1) and iter2 is at (3, 3). If Prev()
+  // is then called in prefix mode, SeekForPrev(2, 1) runs, and iter2 should
+  // become invalid because of the bloom filter.
+  options.memtable_factory.reset(new SkipListFactory);
+  options.write_buffer_size = 1024 * 1024;
+  std::string v13("v13");
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+  PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+  PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+  PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+  PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+  db->Flush(FlushOptions());
+  reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+  PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+  PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+  PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+  PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+  db->Flush(FlushOptions());
+  reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  SeekIterator(iter.get(), 1, 5);
+  iter->Prev();
+  ASSERT_EQ(iter->value(), v13);
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev3) {
+  // Only for SkipListFactory
+  // Test SeekToLast() with iterate_upper_bound_ in prefix_seek_mode.
+  options.memtable_factory.reset(new SkipListFactory);
+  options.write_buffer_size = 1024 * 1024;
+  std::string v14("v14");
+  TestKey upper_bound_key = TestKey(1, 5);
+  std::string s;
+  Slice upper_bound = TestKeyToSlice(s, upper_bound_key);
+
+  {
+    DestroyDB(kDbName, Options());
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+    read_options.iterate_upper_bound = &upper_bound;
+    PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+    PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+    db->Flush(FlushOptions());
+    reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+    PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+    PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+    PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+    PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+    db->Flush(FlushOptions());
+    reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    iter->SeekToLast();
+    ASSERT_EQ(iter->value(), v14);
+  }
+  {
+    DestroyDB(kDbName, Options());
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+    read_options.iterate_upper_bound = &upper_bound;
+    PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+    PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+    PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+    PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+    db->Flush(FlushOptions());
+    reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+    PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+    PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+    db->Flush(FlushOptions());
+    reinterpret_cast<DBImpl*>(db.get())->TEST_WaitForFlushMemTable();
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    iter->SeekToLast();
+    ASSERT_EQ(iter->value(), v14);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+  return
RUN_ALL_TESTS(); +} + +#endif // GFLAGS + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as HashSkipList and HashLinkList are not supported in " + "ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/range_del_aggregator.cc b/src/rocksdb/db/range_del_aggregator.cc new file mode 100644 index 000000000..1f6a7b139 --- /dev/null +++ b/src/rocksdb/db/range_del_aggregator.cc @@ -0,0 +1,484 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/range_del_aggregator.h" + +#include "db/compaction/compaction_iteration_stats.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" +#include "rocksdb/comparator.h" +#include "rocksdb/types.h" +#include "table/internal_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/table_builder.h" +#include "util/heap.h" +#include "util/kv_map.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +TruncatedRangeDelIterator::TruncatedRangeDelIterator( + std::unique_ptr iter, + const InternalKeyComparator* icmp, const InternalKey* smallest, + const InternalKey* largest) + : iter_(std::move(iter)), + icmp_(icmp), + smallest_ikey_(smallest), + largest_ikey_(largest) { + if (smallest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_smallest = pinned_bounds_.back(); + if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) { + assert(false); + } + smallest_ = &parsed_smallest; + } + if (largest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_largest = pinned_bounds_.back(); + if (!ParseInternalKey(largest->Encode(), &parsed_largest)) { + assert(false); + } + if (parsed_largest.type == kTypeRangeDeletion && + parsed_largest.sequence == kMaxSequenceNumber) { + // The file boundary has been artificially extended by a range tombstone. + // We do not need to adjust largest to properly truncate range + // tombstones that extend past the boundary. + } else if (parsed_largest.sequence == 0) { + // The largest key in the sstable has a sequence number of 0. Since we + // guarantee that no internal keys with the same user key and sequence + // number can exist in a DB, we know that the largest key in this sstable + // cannot exist as the smallest key in the next sstable. This further + // implies that no range tombstone in this sstable covers largest; + // otherwise, the file boundary would have been artificially extended. + // + // Therefore, we will never truncate a range tombstone at largest, so we + // can leave it unchanged. + } else { + // The same user key may straddle two sstable boundaries. To ensure that + // the truncated end key can cover the largest key in this sstable, reduce + // its sequence number by 1. 
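+      // (Internal keys with the same user key sort by decreasing sequence
+      // number, so decrementing the sequence number produces a strictly
+      // larger internal key for the same user key.)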
+ parsed_largest.sequence -= 1; + } + largest_ = &parsed_largest; + } +} + +bool TruncatedRangeDelIterator::Valid() const { + return iter_->Valid() && + (smallest_ == nullptr || + icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) && + (largest_ == nullptr || + icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0); +} + +void TruncatedRangeDelIterator::Next() { iter_->TopNext(); } + +void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); } + +void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); } + +// NOTE: target is a user key +void TruncatedRangeDelIterator::Seek(const Slice& target) { + if (largest_ != nullptr && + icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber, + kTypeRangeDeletion)) <= 0) { + iter_->Invalidate(); + return; + } + if (smallest_ != nullptr && + icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) { + iter_->Seek(smallest_->user_key); + return; + } + iter_->Seek(target); +} + +// NOTE: target is a user key +void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) { + if (smallest_ != nullptr && + icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion), + *smallest_) < 0) { + iter_->Invalidate(); + return; + } + if (largest_ != nullptr && + icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) { + iter_->SeekForPrev(largest_->user_key); + return; + } + iter_->SeekForPrev(target); +} + +void TruncatedRangeDelIterator::SeekToFirst() { + if (smallest_ != nullptr) { + iter_->Seek(smallest_->user_key); + return; + } + iter_->SeekToTopFirst(); +} + +void TruncatedRangeDelIterator::SeekToLast() { + if (largest_ != nullptr) { + iter_->SeekForPrev(largest_->user_key); + return; + } + iter_->SeekToTopLast(); +} + +std::map> +TruncatedRangeDelIterator::SplitBySnapshot( + const std::vector& snapshots) { + using FragmentedIterPair = + std::pair>; + + auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots); + std::map> + split_truncated_iters; + std::for_each( + split_untruncated_iters.begin(), split_untruncated_iters.end(), + [&](FragmentedIterPair& iter_pair) { + std::unique_ptr truncated_iter( + new TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_, + smallest_ikey_, largest_ikey_)); + split_truncated_iters.emplace(iter_pair.first, + std::move(truncated_iter)); + }); + return split_truncated_iters; +} + +ForwardRangeDelIterator::ForwardRangeDelIterator( + const InternalKeyComparator* icmp) + : icmp_(icmp), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(EndKeyMinComparator(icmp)), + inactive_iters_(StartKeyMinComparator(icmp)) {} + +bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { + // Move active iterators that end before parsed. + while (!active_iters_.empty() && + icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Next(); + } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + // Move inactive iterators that start before parsed. + while (!inactive_iters_.empty() && + icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) { + iter->Next(); + } + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + return active_seqnums_.empty() + ? 
false + : (*active_seqnums_.begin())->seq() > parsed.sequence; +} + +void ForwardRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); +} + +ReverseRangeDelIterator::ReverseRangeDelIterator( + const InternalKeyComparator* icmp) + : icmp_(icmp), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(StartKeyMaxComparator(icmp)), + inactive_iters_(EndKeyMaxComparator(icmp)) {} + +bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { + // Move active iterators that start after parsed. + while (!active_iters_.empty() && + icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Prev(); + } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + // Move inactive iterators that end after parsed. + while (!inactive_iters_.empty() && + icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) { + iter->Prev(); + } + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + return active_seqnums_.empty() + ? false + : (*active_seqnums_.begin())->seq() > parsed.sequence; +} + +void ReverseRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); +} + +bool RangeDelAggregator::StripeRep::ShouldDelete( + const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { + if (!InStripe(parsed.sequence) || IsEmpty()) { + return false; + } + switch (mode) { + case RangeDelPositioningMode::kForwardTraversal: + InvalidateReverseIter(); + + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx()); + it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) { + auto& iter = *it; + forward_iter_.AddNewIter(iter.get(), parsed); + } + + return forward_iter_.ShouldDelete(parsed); + case RangeDelPositioningMode::kBackwardTraversal: + InvalidateForwardIter(); + + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx()); + it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) { + auto& iter = *it; + reverse_iter_.AddNewIter(iter.get(), parsed); + } + + return reverse_iter_.ShouldDelete(parsed); + default: + assert(false); + return false; + } +} + +bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start, + const Slice& end) { + Invalidate(); + + // Set the internal start/end keys so that: + // - if start_ikey has the same user key and sequence number as the + // current end key, start_ikey will be considered greater; and + // - if end_ikey has the same user key and sequence number as the current + // start key, end_ikey will be considered greater. 
+  ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+                               static_cast<ValueType>(0));
+  ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+  for (auto& iter : iters_) {
+    bool checked_candidate_tombstones = false;
+    for (iter->SeekForPrev(start);
+         iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+         iter->Next()) {
+      checked_candidate_tombstones = true;
+      if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
+      }
+    }
+
+    if (!checked_candidate_tombstones) {
+      // Do an additional check for when the end of the range is the begin
+      // key of a tombstone, which we missed earlier since SeekForPrev'ing
+      // to the start was invalid.
+      iter->SeekForPrev(end);
+      if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void ReadRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
+    return;
+  }
+  rep_.AddTombstones(
+      std::unique_ptr<TruncatedRangeDelIterator>(new TruncatedRangeDelIterator(
+          std::move(input_iter), icmp_, smallest, largest)));
+}
+
+bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
+                                              RangeDelPositioningMode mode) {
+  return rep_.ShouldDelete(parsed, mode);
+}
+
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+                                               const Slice& end) {
+  InvalidateRangeDelMapPositions();
+  return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
+    return;
+  }
+  assert(input_iter->lower_bound() == 0);
+  assert(input_iter->upper_bound() == kMaxSequenceNumber);
+  parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+      std::move(input_iter), icmp_, smallest, largest));
+
+  auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+  for (auto& split_iter : split_iters) {
+    auto it = reps_.find(split_iter.first);
+    if (it == reps_.end()) {
+      bool inserted;
+      SequenceNumber upper_bound = split_iter.second->upper_bound();
+      SequenceNumber lower_bound = split_iter.second->lower_bound();
+      std::tie(it, inserted) = reps_.emplace(
+          split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+      assert(inserted);
+    }
+    assert(it != reps_.end());
+    it->second.AddTombstones(std::move(split_iter.second));
+  }
+}
+
+bool CompactionRangeDelAggregator::ShouldDelete(
+    const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+  auto it = reps_.lower_bound(parsed.sequence);
+  if (it == reps_.end()) {
+    return false;
+  }
+  return it->second.ShouldDelete(parsed, mode);
+}
+
+namespace {
+
+class TruncatedRangeDelMergingIter : public InternalIterator {
+ public:
+  TruncatedRangeDelMergingIter(
+      const InternalKeyComparator* icmp, const Slice* lower_bound,
+      const Slice* upper_bound, bool upper_bound_inclusive,
+      const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
+      : icmp_(icmp),
+        lower_bound_(lower_bound),
+        upper_bound_(upper_bound),
+        upper_bound_inclusive_(upper_bound_inclusive),
+        heap_(StartKeyMinComparator(icmp)) {
+    for (auto& child : children) {
+      if (child != nullptr) {
+        assert(child->lower_bound() == 0);
+        assert(child->upper_bound() == kMaxSequenceNumber);
+        children_.push_back(child.get());
+      }
+    }
+  }
+
+  bool Valid() const override {
+    return !heap_.empty() && BeforeEndKey(heap_.top());
+ } + Status status() const override { return Status::OK(); } + + void SeekToFirst() override { + heap_.clear(); + for (auto& child : children_) { + if (lower_bound_ != nullptr) { + child->Seek(*lower_bound_); + } else { + child->SeekToFirst(); + } + if (child->Valid()) { + heap_.push(child); + } + } + } + + void Next() override { + auto* top = heap_.top(); + top->InternalNext(); + if (top->Valid()) { + heap_.replace_top(top); + } else { + heap_.pop(); + } + } + + Slice key() const override { + auto* top = heap_.top(); + cur_start_key_.Set(top->start_key().user_key, top->seq(), + kTypeRangeDeletion); + return cur_start_key_.Encode(); + } + + Slice value() const override { + auto* top = heap_.top(); + assert(top->end_key().sequence == kMaxSequenceNumber); + return top->end_key().user_key; + } + + // Unused InternalIterator methods + void Prev() override { assert(false); } + void Seek(const Slice& /* target */) override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + private: + bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const { + if (upper_bound_ == nullptr) { + return true; + } + int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key, + *upper_bound_); + return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0; + } + + const InternalKeyComparator* icmp_; + const Slice* lower_bound_; + const Slice* upper_bound_; + bool upper_bound_inclusive_; + BinaryHeap heap_; + std::vector children_; + + mutable InternalKey cur_start_key_; +}; + +} // namespace + +std::unique_ptr +CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound, + const Slice* upper_bound, + bool upper_bound_inclusive) { + InvalidateRangeDelMapPositions(); + std::unique_ptr merging_iter( + new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound, + upper_bound_inclusive, parent_iters_)); + + auto fragmented_tombstone_list = + std::make_shared( + std::move(merging_iter), *icmp_, true /* for_compaction */, + *snapshots_); + + return std::unique_ptr( + new FragmentedRangeTombstoneIterator( + fragmented_tombstone_list, *icmp_, + kMaxSequenceNumber /* upper_bound */)); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h new file mode 100644 index 000000000..b47cf31d3 --- /dev/null +++ b/src/rocksdb/db/range_del_aggregator.h @@ -0,0 +1,441 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
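+// A rough usage sketch of the read-side aggregator declared below (the
+// surrounding locals -- icmp, read_seq, tombstone_iter, smallest, largest,
+// parsed_key -- are illustrative, not part of this header):
+//
+//   ReadRangeDelAggregator agg(&icmp, read_seq /* upper_bound */);
+//   agg.AddTombstones(std::move(tombstone_iter), &smallest, &largest);
+//   if (agg.ShouldDelete(parsed_key,
+//                        RangeDelPositioningMode::kForwardTraversal)) {
+//     // parsed_key is covered by a newer range tombstone; skip it.
+//   }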
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TruncatedRangeDelIterator {
+ public:
+  TruncatedRangeDelIterator(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+      const InternalKeyComparator* icmp, const InternalKey* smallest,
+      const InternalKey* largest);
+
+  bool Valid() const;
+
+  void Next();
+  void Prev();
+
+  void InternalNext();
+
+  // Seeks to the tombstone with the highest visible sequence number that
+  // covers target (a user key). If no such tombstone exists, the position
+  // will be at the earliest tombstone that ends after target.
+  void Seek(const Slice& target);
+
+  // Seeks to the tombstone with the highest visible sequence number that
+  // covers target (a user key). If no such tombstone exists, the position
+  // will be at the latest tombstone that starts before target.
+  void SeekForPrev(const Slice& target);
+
+  void SeekToFirst();
+  void SeekToLast();
+
+  ParsedInternalKey start_key() const {
+    return (smallest_ == nullptr ||
+            icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+               ? iter_->parsed_start_key()
+               : *smallest_;
+  }
+
+  ParsedInternalKey end_key() const {
+    return (largest_ == nullptr ||
+            icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+               ? iter_->parsed_end_key()
+               : *largest_;
+  }
+
+  SequenceNumber seq() const { return iter_->seq(); }
+
+  std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+  SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+  std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+  const InternalKeyComparator* icmp_;
+  const ParsedInternalKey* smallest_ = nullptr;
+  const ParsedInternalKey* largest_ = nullptr;
+  std::list<ParsedInternalKey> pinned_bounds_;
+
+  const InternalKey* smallest_ikey_;
+  const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return a->seq() > b->seq();
+  }
+};
+
+struct StartKeyMinComparator {
+  explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return icmp->Compare(a->start_key(), b->start_key()) > 0;
+  }
+
+  const InternalKeyComparator* icmp;
+};
+
+class ForwardRangeDelIterator {
+ public:
+  explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+  bool ShouldDelete(const ParsedInternalKey& parsed);
+  void Invalidate();
+
+  void AddNewIter(TruncatedRangeDelIterator* iter,
+                  const ParsedInternalKey& parsed) {
+    iter->Seek(parsed.user_key);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  size_t UnusedIdx() const { return unused_idx_; }
+  void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+  using ActiveSeqSet =
+      std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+  struct EndKeyMinComparator {
+    explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const ActiveSeqSet::const_iterator& a,
+                    const
ActiveSeqSet::const_iterator& b) const { + return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0; + } + + const InternalKeyComparator* icmp; + }; + + void PushIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + if (!iter->Valid()) { + // The iterator has been fully consumed, so we don't need to add it to + // either of the heaps. + return; + } + int cmp = icmp_->Compare(parsed, iter->start_key()); + if (cmp < 0) { + PushInactiveIter(iter); + } else { + PushActiveIter(iter); + } + } + + void PushActiveIter(TruncatedRangeDelIterator* iter) { + auto seq_pos = active_seqnums_.insert(iter); + active_iters_.push(seq_pos); + } + + TruncatedRangeDelIterator* PopActiveIter() { + auto active_top = active_iters_.top(); + auto iter = *active_top; + active_iters_.pop(); + active_seqnums_.erase(active_top); + return iter; + } + + void PushInactiveIter(TruncatedRangeDelIterator* iter) { + inactive_iters_.push(iter); + } + + TruncatedRangeDelIterator* PopInactiveIter() { + auto* iter = inactive_iters_.top(); + inactive_iters_.pop(); + return iter; + } + + const InternalKeyComparator* icmp_; + size_t unused_idx_; + ActiveSeqSet active_seqnums_; + BinaryHeap active_iters_; + BinaryHeap inactive_iters_; +}; + +class ReverseRangeDelIterator { + public: + explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp); + + bool ShouldDelete(const ParsedInternalKey& parsed); + void Invalidate(); + + void AddNewIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + iter->SeekForPrev(parsed.user_key); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + size_t UnusedIdx() const { return unused_idx_; } + void IncUnusedIdx() { unused_idx_++; } + + private: + using ActiveSeqSet = + std::multiset; + + struct EndKeyMaxComparator { + explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const TruncatedRangeDelIterator* a, + const TruncatedRangeDelIterator* b) const { + return icmp->Compare(a->end_key(), b->end_key()) < 0; + } + + const InternalKeyComparator* icmp; + }; + struct StartKeyMaxComparator { + explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const ActiveSeqSet::const_iterator& a, + const ActiveSeqSet::const_iterator& b) const { + return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0; + } + + const InternalKeyComparator* icmp; + }; + + void PushIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + if (!iter->Valid()) { + // The iterator has been fully consumed, so we don't need to add it to + // either of the heaps. 
+ } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) { + PushInactiveIter(iter); + } else { + PushActiveIter(iter); + } + } + + void PushActiveIter(TruncatedRangeDelIterator* iter) { + auto seq_pos = active_seqnums_.insert(iter); + active_iters_.push(seq_pos); + } + + TruncatedRangeDelIterator* PopActiveIter() { + auto active_top = active_iters_.top(); + auto iter = *active_top; + active_iters_.pop(); + active_seqnums_.erase(active_top); + return iter; + } + + void PushInactiveIter(TruncatedRangeDelIterator* iter) { + inactive_iters_.push(iter); + } + + TruncatedRangeDelIterator* PopInactiveIter() { + auto* iter = inactive_iters_.top(); + inactive_iters_.pop(); + return iter; + } + + const InternalKeyComparator* icmp_; + size_t unused_idx_; + ActiveSeqSet active_seqnums_; + BinaryHeap active_iters_; + BinaryHeap inactive_iters_; +}; + +enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal }; +class RangeDelAggregator { + public: + explicit RangeDelAggregator(const InternalKeyComparator* icmp) + : icmp_(icmp) {} + virtual ~RangeDelAggregator() {} + + virtual void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) = 0; + + bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { + ParsedInternalKey parsed; + if (!ParseInternalKey(key, &parsed)) { + return false; + } + return ShouldDelete(parsed, mode); + } + virtual bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) = 0; + + virtual void InvalidateRangeDelMapPositions() = 0; + + virtual bool IsEmpty() const = 0; + + bool AddFile(uint64_t file_number) { + return files_seen_.insert(file_number).second; + } + + protected: + class StripeRep { + public: + StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound, + SequenceNumber lower_bound) + : icmp_(icmp), + forward_iter_(icmp), + reverse_iter_(icmp), + upper_bound_(upper_bound), + lower_bound_(lower_bound) {} + + void AddTombstones(std::unique_ptr input_iter) { + iters_.push_back(std::move(input_iter)); + } + + bool IsEmpty() const { return iters_.empty(); } + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode); + + void Invalidate() { + if (!IsEmpty()) { + InvalidateForwardIter(); + InvalidateReverseIter(); + } + } + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + private: + bool InStripe(SequenceNumber seq) const { + return lower_bound_ <= seq && seq <= upper_bound_; + } + + void InvalidateForwardIter() { forward_iter_.Invalidate(); } + + void InvalidateReverseIter() { reverse_iter_.Invalidate(); } + + const InternalKeyComparator* icmp_; + std::vector> iters_; + ForwardRangeDelIterator forward_iter_; + ReverseRangeDelIterator reverse_iter_; + SequenceNumber upper_bound_; + SequenceNumber lower_bound_; + }; + + const InternalKeyComparator* icmp_; + + private: + std::set files_seen_; +}; + +class ReadRangeDelAggregator final : public RangeDelAggregator { + public: + ReadRangeDelAggregator(const InternalKeyComparator* icmp, + SequenceNumber upper_bound) + : RangeDelAggregator(icmp), + rep_(icmp, upper_bound, 0 /* lower_bound */) {} + ~ReadRangeDelAggregator() override {} + + using RangeDelAggregator::ShouldDelete; + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) final override { + if (rep_.IsEmpty()) { + 
return false; + } + return ShouldDeleteImpl(parsed, mode); + } + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); } + + bool IsEmpty() const override { return rep_.IsEmpty(); } + + private: + StripeRep rep_; + + bool ShouldDeleteImpl(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode); +}; + +class CompactionRangeDelAggregator : public RangeDelAggregator { + public: + CompactionRangeDelAggregator(const InternalKeyComparator* icmp, + const std::vector& snapshots) + : RangeDelAggregator(icmp), snapshots_(&snapshots) {} + ~CompactionRangeDelAggregator() override {} + + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + using RangeDelAggregator::ShouldDelete; + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) override; + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + void InvalidateRangeDelMapPositions() override { + for (auto& rep : reps_) { + rep.second.Invalidate(); + } + } + + bool IsEmpty() const override { + for (const auto& rep : reps_) { + if (!rep.second.IsEmpty()) { + return false; + } + } + return true; + } + + // Creates an iterator over all the range tombstones in the aggregator, for + // use in compaction. Nullptr arguments indicate that the iterator range is + // unbounded. + // NOTE: the boundaries are used for optimization purposes to reduce the + // number of tombstones that are passed to the fragmenter; they do not + // guarantee that the resulting iterator only contains range tombstones that + // cover keys in the provided range. If required, these bounds must be + // enforced during iteration. + std::unique_ptr NewIterator( + const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr, + bool upper_bound_inclusive = false); + + private: + std::vector> parent_iters_; + std::map reps_; + + const std::vector* snapshots_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/range_del_aggregator_bench.cc b/src/rocksdb/db/range_del_aggregator_bench.cc new file mode 100644 index 000000000..3f3135f2e --- /dev/null +++ b/src/rocksdb/db/range_del_aggregator_bench.cc @@ -0,0 +1,260 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
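+//
+// Benchmark for ReadRangeDelAggregator: each run adds
+// --num_range_tombstones randomly generated tombstones per AddTombstones
+// call and then issues ShouldDelete calls against random keys, reporting the
+// mean latency of AddTombstones, of the first ShouldDelete call (which pays
+// for positioning the newly added iterators), and of the remaining
+// ShouldDelete calls.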
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created");
+
+DEFINE_int32(num_runs, 1000, "number of test runs");
+
+DEFINE_int32(tombstone_start_upper_bound, 1000,
+             "exclusive upper bound on range tombstone start keys");
+
+DEFINE_int32(should_delete_upper_bound, 1000,
+             "exclusive upper bound on keys passed to ShouldDelete");
+
+DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
+
+DEFINE_double(tombstone_width_stddev, 0.0,
+              "standard deviation of range tombstone width");
+
+DEFINE_int32(seed, 0, "random number generator seed");
+
+DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
+
+DEFINE_int32(add_tombstones_per_run, 1,
+             "number of AddTombstones calls per run");
+
+namespace {
+
+struct Stats {
+  uint64_t time_add_tombstones = 0;
+  uint64_t time_first_should_delete = 0;
+  uint64_t time_rest_should_delete = 0;
+};
+
+std::ostream& operator<<(std::ostream& os, const Stats& s) {
+  std::ios fmt_holder(nullptr);
+  fmt_holder.copyfmt(os);
+
+  os << std::left;
+  os << std::setw(25) << "AddTombstones: "
+     << s.time_add_tombstones /
+            (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+     << " us\n";
+  os << std::setw(25) << "ShouldDelete (first): "
+     << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n";
+  if (FLAGS_should_deletes_per_run > 1) {
+    os << std::setw(25) << "ShouldDelete (rest): "
+       << s.time_rest_should_delete /
+              ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3)
+       << " us\n";
+  }
+
+  os.copyfmt(fmt_holder);
+  return os;
+}
+
+auto icmp = ROCKSDB_NAMESPACE::InternalKeyComparator(
+    ROCKSDB_NAMESPACE::BytewiseComparator());
+
+}  // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// A wrapper around a RangeTombstone and the underlying data of its start and
+// end keys.
+struct PersistentRangeTombstone {
+  std::string start_key;
+  std::string end_key;
+  RangeTombstone tombstone;
+
+  PersistentRangeTombstone(std::string start, std::string end,
+                           SequenceNumber seq)
+      : start_key(std::move(start)), end_key(std::move(end)) {
+    tombstone = RangeTombstone(start_key, end_key, seq);
+  }
+
+  PersistentRangeTombstone() = default;
+
+  PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; }
+
+  PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) {
+    start_key = t.start_key;
+    end_key = t.end_key;
+    tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+    return *this;
+  }
+
+  PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; }
+
+  PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) {
+    start_key = std::move(t.start_key);
+    end_key = std::move(t.end_key);
+    tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+    return *this;
+  }
+};
+
+struct TombstoneStartKeyComparator {
+  explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
+
+  bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
+    return cmp->Compare(a.start_key_, b.start_key_) < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+std::unique_ptr<InternalIterator> MakeRangeDelIterator(
+    const std::vector<PersistentRangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.tombstone.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<InternalIterator>(
+      new test::VectorIterator(keys, values));
+}
+
+// convert long to a big-endian slice key
+static std::string Key(int64_t val) {
+  std::string little_endian_key;
+  std::string big_endian_key;
+  PutFixed64(&little_endian_key, val);
+  assert(little_endian_key.size() == sizeof(val));
+  big_endian_key.resize(sizeof(val));
+  for (size_t i = 0; i < sizeof(val); ++i) {
+    big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+  }
+  return big_endian_key;
+}
+
+}  // anonymous namespace
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  Stats stats;
+  ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed);
+  std::default_random_engine random_gen(FLAGS_seed);
+  std::normal_distribution<double> normal_dist(FLAGS_tombstone_width_mean,
+                                               FLAGS_tombstone_width_stddev);
+  std::vector<std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone> >
+      all_persistent_range_tombstones(FLAGS_add_tombstones_per_run);
+  for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) {
+    all_persistent_range_tombstones[i] =
+        std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone>(
+            FLAGS_num_range_tombstones);
+  }
+  auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal;
+
+  for (int i = 0; i < FLAGS_num_runs; i++) {
+    ROCKSDB_NAMESPACE::ReadRangeDelAggregator range_del_agg(
+        &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */);
+
+    std::vector<
+        std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList> >
+        fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run);
+
+    for (auto& persistent_range_tombstones : all_persistent_range_tombstones) {
+      // TODO(abhimadan): consider whether creating the range tombstones right
+      // before AddTombstones is artificially warming the cache compared to
+      // real workloads.
+      for (int j = 0; j < FLAGS_num_range_tombstones; j++) {
+        uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound);
+        uint64_t end = static_cast<uint64_t>(
+            std::round(start + std::max(1.0, normal_dist(random_gen))));
+        persistent_range_tombstones[j] =
+            ROCKSDB_NAMESPACE::PersistentRangeTombstone(
+                ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j);
+      }
+
+      auto range_del_iter =
+          ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones);
+      fragmented_range_tombstone_lists.emplace_back(
+          new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList(
+              ROCKSDB_NAMESPACE::MakeRangeDelIterator(
+                  persistent_range_tombstones),
+              icmp));
+      std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator>
+          fragmented_range_del_iter(
+              new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator(
+                  fragmented_range_tombstone_lists.back().get(), icmp,
+                  ROCKSDB_NAMESPACE::kMaxSequenceNumber));
+
+      ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones(
+          ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */);
+      range_del_agg.AddTombstones(std::move(fragmented_range_del_iter));
+      stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
+    }
+
+    ROCKSDB_NAMESPACE::ParsedInternalKey parsed_key;
+    parsed_key.sequence = FLAGS_num_range_tombstones / 2;
+    parsed_key.type = ROCKSDB_NAMESPACE::kTypeValue;
+
+    uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound -
+                                     FLAGS_should_deletes_per_run + 1);
+
+    for (int j = 0; j < FLAGS_should_deletes_per_run; j++) {
+      std::string key_string = ROCKSDB_NAMESPACE::Key(first_key + j);
+      parsed_key.user_key = key_string;
+
+      ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete(
+          ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */);
+      range_del_agg.ShouldDelete(parsed_key, mode);
+      uint64_t call_time = stop_watch_should_delete.ElapsedNanos();
+
+      if (j == 0) {
+        stats.time_first_should_delete += call_time;
+      } else {
+        stats.time_rest_should_delete += call_time;
+      }
+    }
+  }
+
+  std::cout << "=========================\n"
+            << "Results:\n"
+            << "=========================\n"
+            << stats;
+
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db/range_del_aggregator_test.cc b/src/rocksdb/db/range_del_aggregator_test.cc
new file mode 100644
index 000000000..0b8b5079c
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_test.cc
@@ -0,0 +1,709 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
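+
+// Illustrative sketch (not part of the upstream file): the compaction-side
+// aggregator tested below merges tombstones from several fragmented lists and
+// can re-emit them through NewIterator(). Per the NOTE in
+// db/range_del_aggregator.h, bounds passed to NewIterator() only prune work;
+// callers must still enforce them during iteration. Variable names here
+// (`fragmented_iter`, `snapshots`) are placeholders:
+//
+//   CompactionRangeDelAggregator agg(&icmp, snapshots);
+//   agg.AddTombstones(std::move(fragmented_iter));
+//   Slice lower("b"), upper("e");
+//   auto it = agg.NewIterator(&lower, &upper, false /* upper_bound_inclusive */);
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // may still see fragments extending beyond [lower, upper); clamp if
+//     // required by the caller
+//   }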
+
+#include "db/range_del_aggregator.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeDelAggregatorTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+    const std::vector<RangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<InternalIterator>(
+      new test::VectorIterator(keys, values));
+}
+
+std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
+MakeFragmentedTombstoneLists(
+    const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
+  std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
+  for (const auto& range_dels : range_dels_list) {
+    auto range_del_iter = MakeRangeDelIter(range_dels);
+    fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
+        std::move(range_del_iter), bytewise_icmp));
+  }
+  return fragment_lists;
+}
+
+struct TruncatedIterScanTestCase {
+  ParsedInternalKey start;
+  ParsedInternalKey end;
+  SequenceNumber seq;
+};
+
+struct TruncatedIterSeekTestCase {
+  Slice target;
+  ParsedInternalKey start;
+  ParsedInternalKey end;
+  SequenceNumber seq;
+  bool invalid;
+};
+
+struct ShouldDeleteTestCase {
+  ParsedInternalKey lookup_key;
+  bool result;
+};
+
+struct IsRangeOverlappedTestCase {
+  Slice start;
+  Slice end;
+  bool result;
+};
+
+ParsedInternalKey UncutEndpoint(const Slice& s) {
+  return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
+}
+
+ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq) {
+  return ParsedInternalKey(key, seq, kTypeValue);
+}
+
+void VerifyIterator(
+    TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+    const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
+  // Test forward iteration.
+  iter->SeekToFirst();
+  for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
+    EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+    EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
+  }
+  EXPECT_FALSE(iter->Valid());
+
+  // Test reverse iteration.
+  iter->SeekToLast();
+  std::vector<TruncatedIterScanTestCase> reverse_expected_range_dels(
+      expected_range_dels.rbegin(), expected_range_dels.rend());
+  for (size_t i = 0; i < reverse_expected_range_dels.size();
+       i++, iter->Prev()) {
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(0, icmp.Compare(iter->start_key(),
+                              reverse_expected_range_dels[i].start));
+    EXPECT_EQ(
+        0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end));
+    EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq());
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+void VerifySeek(TruncatedRangeDelIterator* iter,
+                const InternalKeyComparator& icmp,
+                const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    iter->Seek(test_case.target);
+    if (test_case.invalid) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+      EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+      EXPECT_EQ(test_case.seq, iter->seq());
+    }
+  }
+}
+
+void VerifySeekForPrev(
+    TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+    const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    iter->SeekForPrev(test_case.target);
+    if (test_case.invalid) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+      EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+      EXPECT_EQ(test_case.seq, iter->seq());
+    }
+  }
+}
+
+void VerifyShouldDelete(RangeDelAggregator* range_del_agg,
+                        const std::vector<ShouldDeleteTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    EXPECT_EQ(
+        test_case.result,
+        range_del_agg->ShouldDelete(
+            test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal));
+  }
+  for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) {
+    const auto& test_case = *it;
+    EXPECT_EQ(
+        test_case.result,
+        range_del_agg->ShouldDelete(
+            test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal));
+  }
+}
+
+void VerifyIsRangeOverlapped(
+    ReadRangeDelAggregator* range_del_agg,
+    const std::vector<IsRangeOverlappedTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    EXPECT_EQ(test_case.result,
+              range_del_agg->IsRangeOverlapped(test_case.start, test_case.end));
+  }
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+                       const FragmentedRangeTombstoneIterator* iter) {
+  // Test InternalIterator interface.
+  EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+  EXPECT_EQ(tombstone.end_key_, iter->value());
+  EXPECT_EQ(tombstone.seq_, iter->seq());
+
+  // Test FragmentedRangeTombstoneIterator interface.
+  EXPECT_EQ(tombstone.start_key_, iter->start_key());
+  EXPECT_EQ(tombstone.end_key_, iter->end_key());
+  EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<RangeTombstone>& expected_tombstones) {
+  iter->SeekToFirst();
+  for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+    ASSERT_TRUE(iter->Valid());
+    CheckIterPosition(expected_tombstones[i], iter);
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+}  // namespace
+
+TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) {
+  auto range_del_iter = MakeRangeDelIter({});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+                                 nullptr);
+
+  iter.SeekToFirst();
+  ASSERT_FALSE(iter.Valid());
+
+  iter.SeekToLast();
+  ASSERT_FALSE(iter.Valid());
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIter) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+                                 nullptr);
+
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{UncutEndpoint("a"), UncutEndpoint("e"), 10},
+                  {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+                  {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           9 /* snapshot */));
+
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+                                 nullptr);
+
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{UncutEndpoint("e"), UncutEndpoint("g"), 8},
+                  {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"",
+        UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  InternalKey smallest("d", 7, kTypeValue);
+  InternalKey largest("m", 9, kTypeValue);
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+                                 &smallest, &largest);
+
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{InternalValue("d", 7), UncutEndpoint("e"), 10},
+                  {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+                  {UncutEndpoint("j"), InternalValue("m", 8), 4}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("j"), InternalValue("m", 8), 4},
+       {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"n", UncutEndpoint("j"), InternalValue("m", 8), 4},
+       {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  InternalKey smallest("f", 7, kTypeValue);
+  InternalKey largest("i", 9, kTypeValue);
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+                                 &smallest, &largest);
+
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{InternalValue("f", 7), UncutEndpoint("g"), 8}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8},
+       {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+       {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+       {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}});
+}
+
+TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+  range_del_agg.AddTombstones(std::move(input_iter));
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "c", true},
+                                           {"d", "f", true},
+                                           {"g", "l", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) {
+  auto fragment_lists =
+      MakeFragmentedTombstoneLists(
+          {{{"a", "e", 10}, {"c", "g", 8}},
+           {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+                                      {InternalValue("b", 19), false},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false},
+                                      {InternalValue("h", 24), true},
+                                      {InternalValue("i", 24), false},
+                                      {InternalValue("ii", 14), true},
+                                      {InternalValue("j", 14), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "c", true},
+                                           {"d", "f", true},
+                                           {"g", "l", true},
+                                           {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+                                      {InternalValue("a", 9), true},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false},
+                                      {InternalValue("h", 24), false},
+                                      {InternalValue("i", 24), false},
+                                      {InternalValue("ii", 14), true},
+                                      {InternalValue("j", 14), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "c", true},
+                                           {"d", "f", true},
+                                           {"g", "l", true},
+                                           {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+  std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+      {InternalKey("a", 4, kTypeValue),
+       InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("m", 20, kTypeValue),
+       InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+  for (size_t i = 0; i < fragment_lists.size(); i++) {
+    const auto& fragment_list = fragment_lists[i];
+    const auto& bounds = iter_bounds[i];
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
+                                &bounds.second);
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+                                      {InternalValue("a", 9), false},
+                                      {InternalValue("a", 4), true},
+                                      {InternalValue("m", 10), false},
+                                      {InternalValue("m", 9), true},
+                                      {InternalValue("x", 10), false},
+                                      {InternalValue("x", 9), false},
+                                      {InternalValue("x", 5), true},
+                                      {InternalValue("z", 9), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "n", true},
+                                           {"l", "x", true},
+                                           {"w", "z", true},
+                                           {"zzz", "zz", false},
+                                           {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+       MultipleTruncatedItersInAggregatorSameLevel) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+  std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+      {InternalKey("a", 4, kTypeValue),
+       InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("m", 20, kTypeValue),
+       InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+
+  auto add_iter_to_agg = [&](size_t i) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
+                                             bytewise_icmp, 19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
+                                &iter_bounds[i].second);
+  };
+
+  add_iter_to_agg(0);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+                                      {InternalValue("a", 9), false},
+                                      {InternalValue("a", 4), true}});
+
+  add_iter_to_agg(1);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
+                                      {InternalValue("m", 9), true}});
+
+  add_iter_to_agg(2);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
+                                      {InternalValue("x", 9), false},
+                                      {InternalValue("x", 5), true},
+                                      {InternalValue("z", 9), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "n", true},
+                                           {"l", "x", true},
+                                           {"w", "z", true},
+                                           {"zzz", "zz", false},
+                                           {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots;
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+                                      {InternalValue("b", 19), false},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false},
+                                      {InternalValue("h", 24), true},
+                                      {InternalValue("i", 24), false},
+                                      {InternalValue("ii", 14), true},
+                                      {InternalValue("j", 14), false}});
+
+  auto range_del_compaction_iter = range_del_agg.NewIterator();
+  VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+                                                              {"b", "c", 10},
+                                                              {"c", "e", 10},
+                                                              {"e", "g", 8},
+                                                              {"h", "i", 25},
+                                                              {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(
+      &range_del_agg,
+      {
+          {InternalValue("a", 19), false},  // [10, 19]
+          {InternalValue("a", 9), false},   // [0, 9]
+          {InternalValue("b", 9), false},   // [0, 9]
+          {InternalValue("d", 9), false},   // [0, 9]
+          {InternalValue("d", 7), true},    // [0, 9]
+          {InternalValue("e", 7), true},    // [0, 9]
+          {InternalValue("g", 7), false},   // [0, 9]
+          {InternalValue("h", 24), true},   // [20, kMaxSequenceNumber]
+          {InternalValue("i", 24), false},  // [20, kMaxSequenceNumber]
+          {InternalValue("ii", 14), true},  // [10, 19]
+          {InternalValue("j", 14), false}   // [10, 19]
+      });
+
+  auto range_del_compaction_iter = range_del_agg.NewIterator();
+  VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+                                                              {"a", "b", 10},
+                                                              {"b", "c", 10},
+                                                              {"c", "e", 10},
+                                                              {"c", "e", 8},
+                                                              {"e", "g", 8},
+                                                              {"h", "i", 25},
+                                                              {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("_");
+  Slice end("__");
+
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("p");
+  Slice end("q");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("bb");
+  Slice end("e");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
+                            {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(
+      range_del_compaction_iter2.get(),
+      {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+       CompactionAggregatorBoundedIteratorExtraFragments) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "d", 10}, {"c", "g", 8}},
+       {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("bb");
+  Slice end("e");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10},
+                                                               {"b", "c", 20},
+                                                               {"b", "c", 10},
+                                                               {"c", "d", 10},
+                                                               {"c", "d", 8},
+                                                               {"d", "f", 30},
+                                                               {"d", "f", 8},
+                                                               {"f", "g", 8}});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10},
+                                                               {"b", "c", 20},
+                                                               {"b", "c", 10},
+                                                               {"c", "d", 10},
+                                                               {"c", "d", 8},
+                                                               {"d", "f", 30},
+                                                               {"d", "f", 8},
+                                                               {"f", "g", 8}});
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.cc b/src/rocksdb/db/range_tombstone_fragmenter.cc
new file mode 100644
index 000000000..58426248c
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,439 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <functional>
+#include <set>
+
+#include <cinttypes>
+#include <cstdio>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  if (unfragmented_tombstones == nullptr) {
+    return;
+  }
+  bool is_sorted = true;
+  int num_tombstones = 0;
+  InternalKey pinned_last_start_key;
+  Slice last_start_key;
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next(), num_tombstones++) {
+    if (num_tombstones > 0 &&
+        icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+      is_sorted = false;
+      break;
+    }
+    if (unfragmented_tombstones->IsKeyPinned()) {
+      last_start_key = unfragmented_tombstones->key();
+    } else {
+      pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+      last_start_key = pinned_last_start_key.Encode();
+    }
+  }
+  if (is_sorted) {
+    FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
+                       snapshots);
+    return;
+  }
+
+  // Sort the tombstones before fragmenting them.
+  std::vector<std::string> keys, values;
+  keys.reserve(num_tombstones);
+  values.reserve(num_tombstones);
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next()) {
+    keys.emplace_back(unfragmented_tombstones->key().data(),
+                      unfragmented_tombstones->key().size());
+    values.emplace_back(unfragmented_tombstones->value().data(),
+                        unfragmented_tombstones->value().size());
+  }
+  // VectorIterator implicitly sorts by key during construction.
+  auto iter = std::unique_ptr<VectorIterator>(
+      new VectorIterator(std::move(keys), std::move(values), &icmp));
+  FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  Slice cur_start_key(nullptr, 0);
+  auto cmp = ParsedInternalKeyComparator(&icmp);
+
+  // Stores the end keys and sequence numbers of range tombstones with a start
+  // key less than or equal to cur_start_key. Provides an ordering by end key
+  // for use in flush_current_tombstones.
+  std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+  // Given the next start key in unfragmented_tombstones,
+  // flush_current_tombstones writes every tombstone fragment that starts
+  // and ends with a key before next_start_key, and starts with a key greater
+  // than or equal to cur_start_key.
+  auto flush_current_tombstones = [&](const Slice& next_start_key) {
+    auto it = cur_end_keys.begin();
+    bool reached_next_start_key = false;
+    for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+      Slice cur_end_key = it->user_key;
+      if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
+        // Empty tombstone.
+        continue;
+      }
+      if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
+        // All of the end keys in [it, cur_end_keys.end()) are after
+        // next_start_key, so the tombstones they represent can be used in
+        // fragments that start with keys greater than or equal to
+        // next_start_key. However, the end keys we already passed will not be
+        // used in any more tombstone fragments.
+        //
+        // Remove the fully fragmented tombstones and stop iteration after a
+        // final round of flushing to preserve the tombstones we can create
+        // more fragments from.
+        reached_next_start_key = true;
+        cur_end_keys.erase(cur_end_keys.begin(), it);
+        cur_end_key = next_start_key;
+      }
+
+      // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+      // should not overlap with the last-flushed tombstone fragment.
+      assert(tombstones_.empty() ||
+             icmp.user_comparator()->Compare(tombstones_.back().end_key,
+                                             cur_start_key) <= 0);
+
+      // Sort the sequence numbers of the tombstones being fragmented in
+      // descending order, and then flush them in that order.
+      autovector<SequenceNumber> seqnums_to_flush;
+      for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+        seqnums_to_flush.push_back(flush_it->sequence);
+      }
+      std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+                std::greater<SequenceNumber>());
+
+      size_t start_idx = tombstone_seqs_.size();
+      size_t end_idx = start_idx + seqnums_to_flush.size();
+
+      if (for_compaction) {
+        // Drop all tombstone seqnums that are not preserved by a snapshot.
+        SequenceNumber next_snapshot = kMaxSequenceNumber;
+        for (auto seq : seqnums_to_flush) {
+          if (seq <= next_snapshot) {
+            // This seqnum is visible by a lower snapshot.
+            tombstone_seqs_.push_back(seq);
+            seq_set_.insert(seq);
+            auto upper_bound_it =
+                std::lower_bound(snapshots.begin(), snapshots.end(), seq);
+            if (upper_bound_it == snapshots.begin()) {
+              // This seqnum is the topmost one visible by the earliest
+              // snapshot. None of the seqnums below it will be visible, so we
+              // can skip them.
+              break;
+            }
+            next_snapshot = *std::prev(upper_bound_it);
+          }
+        }
+        end_idx = tombstone_seqs_.size();
+      } else {
+        // The fragmentation is being done for reads, so preserve all seqnums.
+        tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
+                               seqnums_to_flush.end());
+        seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end());
+      }
+
+      assert(start_idx < end_idx);
+      tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, end_idx);
+
+      cur_start_key = cur_end_key;
+    }
+    if (!reached_next_start_key) {
+      // There is a gap between the last flushed tombstone fragment and
+      // the next tombstone's start key. Remove all the end keys in
+      // the working set, since we have fully fragmented their corresponding
+      // tombstones.
+      cur_end_keys.clear();
+    }
+    cur_start_key = next_start_key;
+  };
+
+  pinned_iters_mgr_.StartPinning();
+
+  bool no_tombstones = true;
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next()) {
+    const Slice& ikey = unfragmented_tombstones->key();
+    Slice tombstone_start_key = ExtractUserKey(ikey);
+    SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
+    if (!unfragmented_tombstones->IsKeyPinned()) {
+      pinned_slices_.emplace_back(tombstone_start_key.data(),
+                                  tombstone_start_key.size());
+      tombstone_start_key = pinned_slices_.back();
+    }
+    no_tombstones = false;
+
+    Slice tombstone_end_key = unfragmented_tombstones->value();
+    if (!unfragmented_tombstones->IsValuePinned()) {
+      pinned_slices_.emplace_back(tombstone_end_key.data(),
+                                  tombstone_end_key.size());
+      tombstone_end_key = pinned_slices_.back();
+    }
+    if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
+                                     cur_start_key, tombstone_start_key) != 0) {
+      // The start key has changed. Flush all tombstones that start before
+      // this new start key.
+      flush_current_tombstones(tombstone_start_key);
+    }
+    cur_start_key = tombstone_start_key;
+
+    cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
+  }
+  if (!cur_end_keys.empty()) {
+    ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
+    flush_current_tombstones(last_end_key.user_key);
+  }
+
+  if (!no_tombstones) {
+    pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
+                                  false /* arena */);
+  }
+}
+
+bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
+                                                 SequenceNumber upper) const {
+  auto seq_it = seq_set_.lower_bound(lower);
+  return seq_it != seq_set_.end() && *seq_it <= upper;
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const FragmentedRangeTombstoneList* tombstones,
+    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+    SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_(tombstones),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound) {
+  assert(tombstones_ != nullptr);
+  Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+    SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_ref_(tombstones),
+      tombstones_(tombstones_ref_.get()),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound) {
+  assert(tombstones_ != nullptr);
+  Invalidate();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToFirst() {
+  pos_ = tombstones_->begin();
+  seq_pos_ = tombstones_->seq_begin();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = tombstones_->begin();
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToLast() {
+  pos_ = std::prev(tombstones_->end());
+  seq_pos_ = std::prev(tombstones_->seq_end());
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopLast() {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = std::prev(tombstones_->end());
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  SeekToCoveringTombstone(target);
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  SeekForPrevToCoveringTombstone(target);
+  ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
+    const Slice& target) {
+  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+                          tombstone_end_cmp_);
+  if (pos_ == tombstones_->end()) {
+    // All tombstones end before target.
+    seq_pos_ = tombstones_->seq_end();
+    return;
+  }
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
+    const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+                          tombstone_start_cmp_);
+  if (pos_ == tombstones_->begin()) {
+    // All tombstones start after target.
+    Invalidate();
+    return;
+  }
+  --pos_;
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    ++pos_;
+    if (pos_ == tombstones_->end()) {
+      Invalidate();
+      return;
+    }
+    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                                tombstones_->seq_iter(pos_->seq_end_idx),
+                                upper_bound_, std::greater<SequenceNumber>());
+  }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    if (pos_ == tombstones_->begin()) {
+      Invalidate();
+      return;
+    }
+    --pos_;
+    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                                tombstones_->seq_iter(pos_->seq_end_idx),
+                                upper_bound_, std::greater<SequenceNumber>());
+  }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+  ++seq_pos_;
+  if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+    ++pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+  ++pos_;
+  if (pos_ == tombstones_->end()) {
+    return;
+  }
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+  if (seq_pos_ == tombstones_->seq_begin()) {
+    Invalidate();
+    return;
+  }
+  --seq_pos_;
+  if (pos_ == tombstones_->end() ||
+      seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+    --pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+  if (pos_ == tombstones_->begin()) {
+    Invalidate();
+    return;
+  }
+  --pos_;
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+  return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+    const Slice& target_user_key) {
+  SeekToCoveringTombstone(target_user_key);
+  return ValidPos() && ucmp_->Compare(start_key(), target_user_key) <= 0 ? seq()
+                                                                         : 0;
+}
+
+std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+FragmentedRangeTombstoneIterator::SplitBySnapshot(
+    const std::vector<SequenceNumber>& snapshots) {
+  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+      splits;
+  SequenceNumber lower = 0;
+  SequenceNumber upper;
+  for (size_t i = 0; i <= snapshots.size(); i++) {
+    if (i >= snapshots.size()) {
+      upper = kMaxSequenceNumber;
+    } else {
+      upper = snapshots[i];
+    }
+    if (tombstones_->ContainsRange(lower, upper)) {
+      splits.emplace(upper, std::unique_ptr<FragmentedRangeTombstoneIterator>(
+                                new FragmentedRangeTombstoneIterator(
+                                    tombstones_, *icmp_, upper, lower)));
+    }
+    lower = upper + 1;
+  }
+  return splits;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.h b/src/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 000000000..63ec24e64
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,256 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
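+
+// Illustrative example (not part of the upstream file): fragmentation turns
+// overlapping tombstones into non-overlapping "stacks". Given the tombstones
+// [a, e)@10 and [c, g)@8, the structures declared below would hold the
+// fragments [a, c)@10, [c, e)@{10, 8}, and [e, g)@8. Using the
+// MakeRangeDelIter helper from the unit tests:
+//
+//   auto iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
+//   FragmentedRangeTombstoneList fragments(std::move(iter), icmp);
+//   FragmentedRangeTombstoneIterator it(&fragments, icmp, kMaxSequenceNumber);
+//   for (it.SeekToFirst(); it.Valid(); it.Next()) {
+//     // visits [a, c)@10, [c, e)@10, [c, e)@8, [e, g)@8
+//   }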
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FragmentedRangeTombstoneList {
+ public:
+  // A compact representation of a "stack" of range tombstone fragments, which
+  // start and end at the same user keys but have different sequence numbers.
+  // The members seq_start_idx and seq_end_idx are intended to be parameters to
+  // seq_iter().
+  struct RangeTombstoneStack {
+    RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+                        size_t end_idx)
+        : start_key(start),
+          end_key(end),
+          seq_start_idx(start_idx),
+          seq_end_idx(end_idx) {}
+
+    Slice start_key;
+    Slice end_key;
+    size_t seq_start_idx;
+    size_t seq_end_idx;
+  };
+  FragmentedRangeTombstoneList(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction = false,
+      const std::vector<SequenceNumber>& snapshots = {});
+
+  std::vector<RangeTombstoneStack>::const_iterator begin() const {
+    return tombstones_.begin();
+  }
+
+  std::vector<RangeTombstoneStack>::const_iterator end() const {
+    return tombstones_.end();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+    return std::next(tombstone_seqs_.begin(), idx);
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_begin() const {
+    return tombstone_seqs_.begin();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_end() const {
+    return tombstone_seqs_.end();
+  }
+
+  bool empty() const { return tombstones_.empty(); }
+
+  // Returns true if the stored tombstones contain one with a sequence
+  // number in [lower, upper].
+  bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;
+
+ private:
+  // Given an ordered range tombstone iterator unfragmented_tombstones,
+  // "fragment" the tombstones into non-overlapping pieces, and store them in
+  // tombstones_ and tombstone_seqs_.
+  void FragmentTombstones(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction,
+      const std::vector<SequenceNumber>& snapshots);
+
+  std::vector<RangeTombstoneStack> tombstones_;
+  std::vector<SequenceNumber> tombstone_seqs_;
+  std::set<SequenceNumber> seq_set_;
+  std::list<std::string> pinned_slices_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
+// tombstone collapsing is always O(n log n).
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+  FragmentedRangeTombstoneIterator(
+      const FragmentedRangeTombstoneList* tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      SequenceNumber lower_bound = 0);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      SequenceNumber lower_bound = 0);
+
+  void SeekToFirst() override;
+  void SeekToLast() override;
+
+  void SeekToTopFirst();
+  void SeekToTopLast();
+
+  // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+  // seeking should behave.
+  // This is OK because they are not currently used, but eventually
+  // FragmentedRangeTombstoneIterator should no longer implement
+  // InternalIterator.
+  //
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+  // the snapshot that ends after target.
+  void Seek(const Slice& target) override;
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+  // snapshot that starts before target.
+  void SeekForPrev(const Slice& target) override;
+
+  void Next() override;
+  void Prev() override;
+
+  void TopNext();
+  void TopPrev();
+
+  bool Valid() const override;
+  Slice key() const override {
+    MaybePinKey();
+    return current_start_key_.Encode();
+  }
+  Slice value() const override { return pos_->end_key; }
+  bool IsKeyPinned() const override { return false; }
+  bool IsValuePinned() const override { return true; }
+  Status status() const override { return Status::OK(); }
+
+  bool empty() const { return tombstones_->empty(); }
+  void Invalidate() {
+    pos_ = tombstones_->end();
+    seq_pos_ = tombstones_->seq_end();
+    pinned_pos_ = tombstones_->end();
+    pinned_seq_pos_ = tombstones_->seq_end();
+  }
+
+  RangeTombstone Tombstone() const {
+    return RangeTombstone(start_key(), end_key(), seq());
+  }
+  Slice start_key() const { return pos_->start_key; }
+  Slice end_key() const { return pos_->end_key; }
+  SequenceNumber seq() const { return *seq_pos_; }
+  ParsedInternalKey parsed_start_key() const {
+    return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+  ParsedInternalKey parsed_end_key() const {
+    return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+
+  SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+  // Splits the iterator into n+1 iterators (where n is the number of
+  // snapshots), each providing a view over a "stripe" of sequence numbers. The
+  // iterators are keyed by the upper bound of their ranges (the provided
+  // snapshots + kMaxSequenceNumber).
+  //
+  // NOTE: the iterators in the returned map are no longer valid if their
+  // parent iterator is deleted, since they do not modify the refcount of the
+  // underlying tombstone list. Therefore, this map should be deleted before
+  // the parent iterator.
+  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return upper_bound_; }
+  SequenceNumber lower_bound() const { return lower_bound_; }
+
+ private:
+  using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+  struct RangeTombstoneStackStartComparator {
+    explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->Compare(a.start_key, b.start_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->Compare(a.start_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->Compare(a, b.start_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  struct RangeTombstoneStackEndComparator {
+    explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->Compare(a.end_key, b.end_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->Compare(a.end_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->Compare(a, b.end_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  void MaybePinKey() const {
+    if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+        (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+      current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+      pinned_pos_ = pos_;
+      pinned_seq_pos_ = seq_pos_;
+    }
+  }
+
+  void SeekToCoveringTombstone(const Slice& key);
+  void SeekForPrevToCoveringTombstone(const Slice& key);
+  void ScanForwardToVisibleTombstone();
+  void ScanBackwardToVisibleTombstone();
+  bool ValidPos() const {
+    return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+  }
+
+  const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+  const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+  const InternalKeyComparator* icmp_;
+  const Comparator* ucmp_;
+  std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
+  const FragmentedRangeTombstoneList* tombstones_;
+  SequenceNumber upper_bound_;
+  SequenceNumber lower_bound_;
+  std::vector<RangeTombstoneStack>::const_iterator pos_;
+  std::vector<SequenceNumber>::const_iterator seq_pos_;
+  mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+  mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+  mutable InternalKey current_start_key_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter_test.cc b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
new file mode 100644
index 000000000..56234b1dd
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
@@ -0,0 +1,552 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
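+
+// Illustrative sketch (not part of the upstream file): SplitBySnapshot, whose
+// contract is documented in range_tombstone_fragmenter.h, partitions a
+// fragmented iterator into per-snapshot-stripe iterators keyed by each
+// stripe's upper bound. With hypothetical snapshots {10, 20}:
+//
+//   FragmentedRangeTombstoneIterator iter(&fragment_list, icmp,
+//                                         kMaxSequenceNumber);
+//   std::vector<SequenceNumber> snapshots{10, 20};
+//   auto splits = iter.SplitBySnapshot(snapshots);
+//   // splits may contain iterators keyed by 10, 20, and kMaxSequenceNumber,
+//   // covering seqnum ranges [0, 10], [11, 20], and [21, kMaxSequenceNumber];
+//   // stripes with no tombstones are omitted. The map must be destroyed
+//   // before `iter`, per the NOTE in the header.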
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include "db/db_test_util.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTombstoneFragmenterTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+    const std::vector<RangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<InternalIterator>(
+      new test::VectorIterator(keys, values));
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+                       const FragmentedRangeTombstoneIterator* iter) {
+  // Test InternalIterator interface.
+  EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+  EXPECT_EQ(tombstone.end_key_, iter->value());
+  EXPECT_EQ(tombstone.seq_, iter->seq());
+
+  // Test FragmentedRangeTombstoneIterator interface.
+  EXPECT_EQ(tombstone.start_key_, iter->start_key());
+  EXPECT_EQ(tombstone.end_key_, iter->end_key());
+  EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<RangeTombstone>& expected_tombstones) {
+  iter->SeekToFirst();
+  for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+    ASSERT_TRUE(iter->Valid());
+    CheckIterPosition(expected_tombstones[i], iter);
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+void VerifyVisibleTombstones(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<RangeTombstone>& expected_tombstones) {
+  iter->SeekToTopFirst();
+  for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) {
+    ASSERT_TRUE(iter->Valid());
+    CheckIterPosition(expected_tombstones[i], iter);
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+struct SeekTestCase {
+  Slice seek_target;
+  RangeTombstone expected_position;
+  bool out_of_range;
+};
+
+void VerifySeek(FragmentedRangeTombstoneIterator* iter,
+                const std::vector<SeekTestCase>& cases) {
+  for (const auto& testcase : cases) {
+    iter->Seek(testcase.seek_target);
+    if (testcase.out_of_range) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      CheckIterPosition(testcase.expected_position, iter);
+    }
+  }
+}
+
+void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter,
+                       const std::vector<SeekTestCase>& cases) {
+  for (const auto& testcase : cases) {
+    iter->SeekForPrev(testcase.seek_target);
+    if (testcase.out_of_range) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      CheckIterPosition(testcase.expected_position, iter);
+    }
+  }
+}
+
+struct MaxCoveringTombstoneSeqnumTestCase {
+  Slice user_key;
+  SequenceNumber result;
+};
+
+void VerifyMaxCoveringTombstoneSeqnum(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<MaxCoveringTombstoneSeqnumTestCase>& cases) {
+  for (const auto& testcase : cases) {
+    EXPECT_EQ(testcase.result,
+              iter->MaxCoveringTombstoneSeqnum(testcase.user_key));
+  }
+}
+
+}  // anonymous namespace
+
+TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+  VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d",
5}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels( + &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) { + auto range_del_iter = MakeRangeDelIter( + {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels( + &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) { + auto range_del_iter = + MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, + {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"a", "c", 7}, + {"a", "c", 3}, + {"c", "e", 10}, + {"c", "e", 7}, + {"e", "g", 7}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) { + auto range_del_iter = MakeRangeDelIter({{"a", "c", 30}, + {"a", "g", 20}, + {"a", "e", 10}, + {"a", "g", 7}, + {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 30}, + {"a", "c", 20}, + {"a", "c", 10}, + {"a", "c", 7}, + {"a", "c", 3}, + {"c", "e", 20}, + {"c", "e", 10}, + {"c", "e", 7}, + {"e", "g", 20}, + {"e", "g", 7}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 
8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp, + 7 /* upper_bound */); + FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp, + 5 /* upper_bound */); + FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) { + VerifyFragmentedRangeDels(iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"c", "e", 6}, + {"e", "g", 8}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"j", "l", 2}, + {"l", "n", 4}}); + } + + ASSERT_EQ(0, iter1.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound()); + VerifyVisibleTombstones(&iter1, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter2.lower_bound()); + ASSERT_EQ(9, iter2.upper_bound()); + VerifyVisibleTombstones(&iter2, {{"c", "e", 8}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter3.lower_bound()); + ASSERT_EQ(7, iter3.upper_bound()); + VerifyVisibleTombstones(&iter3, {{"c", "e", 6}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter4.lower_bound()); + ASSERT_EQ(5, iter4.upper_bound()); + VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter5.lower_bound()); + ASSERT_EQ(3, iter5.upper_bound()); + VerifyVisibleTombstones(&iter5, {{"j", "l", 2}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(9, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"c", "e", 6}, + {"e", "g", 8}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"j", "l", 2}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {} /* snapshots */); + FragmentedRangeTombstoneIterator iter(&fragment_list, 
bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, + OverlapAndRepeatedStartKeyForCompactionWithSnapshot) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {20, 9} /* upper_bounds */); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({} /* snapshots */); + ASSERT_EQ(1, split_iters.size()); + + auto* split_iter = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(0, split_iter->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound()); + VerifyVisibleTombstones(split_iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */); + ASSERT_EQ(5, split_iters.size()); + + auto* split_iter1 = split_iters[3].get(); + ASSERT_EQ(0, split_iter1->lower_bound()); + ASSERT_EQ(3, split_iter1->upper_bound()); + VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}}); + + auto* split_iter2 = split_iters[5].get(); + ASSERT_EQ(4, split_iter2->lower_bound()); + ASSERT_EQ(5, split_iter2->upper_bound()); + VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}}); + + auto* split_iter3 = split_iters[7].get(); + ASSERT_EQ(6, split_iter3->lower_bound()); + ASSERT_EQ(7, split_iter3->upper_bound()); + VerifyVisibleTombstones(split_iter3, + {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}}); + + auto* split_iter4 = split_iters[9].get(); + ASSERT_EQ(8, split_iter4->lower_bound()); + ASSERT_EQ(9, split_iter4->upper_bound()); + VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}}); + + auto* split_iter5 = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(10, split_iter5->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound()); + VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) { + // Same tombstones as OverlapAndRepeatedStartKey. 
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek( + &iter1, + {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); + VerifySeekForPrev( + &iter1, + {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"a", {"j", "l", 2}}, + {"e", {"j", "l", 2}}, + {"l", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */}, + {"e", {}, true /* out of range */}, + {"l", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekCovered) { + // Same tombstones as OverlapAndRepeatedStartKey. + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek( + &iter1, + {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); + VerifySeekForPrev( + &iter1, + {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"b", {"j", "l", 2}}, + {"f", {"j", "l", 2}}, + {"m", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */}, + {"f", {}, true /* out of range */}, + {"m", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) { + // Same tombstones as OverlapAndRepeatedStartKey. + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek(&iter1, {{"c", {"c", "e", 10}}, + {"g", {"g", "i", 6}}, + {"i", {"j", "l", 4}}, + {"n", {}, true /* out of range */}}); + VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}}, + {"g", {"g", "i", 6}}, + {"i", {"g", "i", 6}}, + {"n", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"c", {"j", "l", 2}}, + {"g", {"j", "l", 2}}, + {"i", {"j", "l", 2}}, + {"n", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */}, + {"g", {}, true /* out of range */}, + {"i", {}, true /* out of range */}, + {"n", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) { + // Same tombstones as OverlapAndRepeatedStartKey. 
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}}); + VerifySeekForPrev(&iter, + {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}}); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/read_callback.h b/src/rocksdb/db/read_callback.h new file mode 100644 index 000000000..fbef1dd0d --- /dev/null +++ b/src/rocksdb/db/read_callback.h @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class ReadCallback { + public: + ReadCallback(SequenceNumber last_visible_seq) + : max_visible_seq_(last_visible_seq) {} + ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted) + : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {} + + virtual ~ReadCallback() {} + + // Will be called to see if the seq number visible; if not it moves on to + // the next seq number. + virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0; + + inline bool IsVisible(SequenceNumber seq) { + assert(min_uncommitted_ > 0); + assert(min_uncommitted_ >= kMinUnCommittedSeq); + if (seq < min_uncommitted_) { // handles seq == 0 as well + assert(seq <= max_visible_seq_); + return true; + } else if (max_visible_seq_ < seq) { + assert(seq != 0); + return false; + } else { + assert(seq != 0); // already handled in the first if-then clause + return IsVisibleFullCheck(seq); + } + } + + inline SequenceNumber max_visible_seq() { return max_visible_seq_; } + + // Refresh to a more recent visible seq + virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; } + + protected: + // The max visible seq, it is usually the snapshot but could be larger if + // transaction has its own writes written to db. + SequenceNumber max_visible_seq_ = kMaxSequenceNumber; + // Any seq less than min_uncommitted_ is committed. + const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc new file mode 100644 index 000000000..383ffe3a4 --- /dev/null +++ b/src/rocksdb/db/repair.cc @@ -0,0 +1,691 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Repairer does best effort recovery to recover as much data as possible after +// a disaster without compromising consistency. It does not guarantee bringing +// the database to a time consistent state. 
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
new file mode 100644
index 000000000..383ffe3a4
--- /dev/null
+++ b/src/rocksdb/db/repair.cc
@@ -0,0 +1,691 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Repairer does best-effort recovery to recover as much data as possible
+// after a disaster without compromising consistency. It does not guarantee
+// bringing the database to a time-consistent state.
+//
+// The repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory and classifies
+// them based on their file name. Any file that cannot be identified by name
+// is ignored.
+//
+// (b) Convert logs to tables
+//
+// Every active log file is replayed. All sections of the file where the
+// checksum does not match are skipped over. We intentionally give preference
+// to data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest key for the table
+// (2) largest sequence number in the table
+// (3) oldest blob file referred to by the table (if applicable)
+//
+// If we are unable to scan the file, then we ignore the table.
+//
+// (d) Write Descriptor
+//
+// We generate descriptor contents:
+//  - log number is set to zero
+//  - next-file-number is set to 1 + largest file number we found
+//  - last-sequence-number is set to the largest sequence# found across
+//    all tables (see (2) in phase (c))
+//  - compaction pointers are cleared
+//  - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use it to pick an appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps an earlier table, place in level-0,
+//       else place in level-M.
+//   (d) We can provide options for time-consistent recovery and unsafe
+//       recovery (ignore checksum failure when applicable)
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#, ...)
+//   in the table's meta section to speed up ScanTable.
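Phase (c) amounts to one linear scan per table that folds each entry into a key range and a maximum sequence number. A minimal sketch of that fold, assuming entries are already parsed into (user_key, seqno) pairs under a bytewise order; ScannedEntry, TableStats, and ComputeStats are illustrative names, not part of the file below:

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

// Illustrative only: the real code walks an InternalIterator and parses
// internal keys; here a table is just a vector of (user_key, seqno) pairs.
struct ScannedEntry {
  std::string user_key;
  uint64_t seqno;
};

struct TableStats {
  std::string smallest, largest;
  uint64_t max_seqno = 0;
};

// One pass over the table's entries, as in phase (c) / ScanTable() below.
TableStats ComputeStats(const std::vector<ScannedEntry>& entries) {
  TableStats s;
  for (const auto& e : entries) {
    if (s.smallest.empty() || e.user_key < s.smallest) s.smallest = e.user_key;
    if (e.user_key > s.largest) s.largest = e.user_key;
    s.max_seqno = std::max(s.max_seqno, e.seqno);
  }
  return s;
}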
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const DBOptions& db_options,
+           const std::vector<ColumnFamilyDescriptor>& column_families,
+           const ColumnFamilyOptions& default_cf_opts,
+           const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
+      : dbname_(dbname),
+        env_(db_options.env),
+        env_options_(),
+        db_options_(SanitizeOptions(dbname_, db_options)),
+        immutable_db_options_(ImmutableDBOptions(db_options_)),
+        icmp_(default_cf_opts.comparator),
+        default_cf_opts_(
+            SanitizeOptions(immutable_db_options_, default_cf_opts)),
+        default_cf_iopts_(
+            ImmutableCFOptions(immutable_db_options_, default_cf_opts_)),
+        unknown_cf_opts_(
+            SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+        create_unknown_cfs_(create_unknown_cfs),
+        raw_table_cache_(
+            // TableCache can be small since we expect each table to be opened
+            // once.
+            NewLRUCache(10, db_options_.table_cache_numshardbits)),
+        table_cache_(new TableCache(default_cf_iopts_, env_options_,
+                                    raw_table_cache_.get(),
+                                    /*block_cache_tracer=*/nullptr)),
+        wb_(db_options_.db_write_buffer_size),
+        wc_(db_options_.delayed_write_rate),
+        vset_(dbname_, &immutable_db_options_, env_options_,
+              raw_table_cache_.get(), &wb_, &wc_,
+              /*block_cache_tracer=*/nullptr),
+        next_file_number_(1),
+        db_lock_(nullptr) {
+    for (const auto& cfd : column_families) {
+      cf_name_to_opts_[cfd.name] = cfd.options;
+    }
+  }
+
+  const ColumnFamilyOptions* GetColumnFamilyOptions(
+      const std::string& cf_name) {
+    if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
+      if (create_unknown_cfs_) {
+        return &unknown_cf_opts_;
+      }
+      return nullptr;
+    }
+    return &cf_name_to_opts_[cf_name];
+  }
+
+  // Adds a column family to the VersionSet with cf_options_ and updates the
+  // manifest.
+  Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
+    const auto* cf_opts = GetColumnFamilyOptions(cf_name);
+    if (cf_opts == nullptr) {
+      return Status::Corruption("Encountered unknown column family with name=" +
+                                cf_name + ", id=" + ToString(cf_id));
+    }
+    Options opts(db_options_, *cf_opts);
+    MutableCFOptions mut_cf_opts(opts);
+
+    VersionEdit edit;
+    edit.SetComparatorName(opts.comparator->Name());
+    edit.SetLogNumber(0);
+    edit.SetColumnFamily(cf_id);
+    ColumnFamilyData* cfd = nullptr;
+    edit.AddColumnFamily(cf_name);
+
+    mutex_.Lock();
+    Status status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_,
+                                      nullptr /* db_directory */,
+                                      false /* new_descriptor_log */, cf_opts);
+    mutex_.Unlock();
+    return status;
+  }
+
+  ~Repairer() {
+    if (db_lock_ != nullptr) {
+      env_->UnlockFile(db_lock_);
+    }
+    delete table_cache_;
+  }
" + "****", + dbname_.c_str(), tables_.size(), bytes); + } + return status; + } + + private: + struct TableInfo { + FileMetaData meta; + uint32_t column_family_id; + std::string column_family_name; + }; + + std::string const dbname_; + Env* const env_; + const EnvOptions env_options_; + const DBOptions db_options_; + const ImmutableDBOptions immutable_db_options_; + const InternalKeyComparator icmp_; + const ColumnFamilyOptions default_cf_opts_; + const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference + const ColumnFamilyOptions unknown_cf_opts_; + const bool create_unknown_cfs_; + std::shared_ptr raw_table_cache_; + TableCache* table_cache_; + WriteBufferManager wb_; + WriteController wc_; + VersionSet vset_; + std::unordered_map cf_name_to_opts_; + InstrumentedMutex mutex_; + + std::vector manifests_; + std::vector table_fds_; + std::vector logs_; + std::vector tables_; + uint64_t next_file_number_; + // Lock over the persistent DB state. Non-nullptr iff successfully + // acquired. + FileLock* db_lock_; + + Status FindFiles() { + std::vector filenames; + bool found_file = false; + std::vector to_search_paths; + + for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { + to_search_paths.push_back(db_options_.db_paths[path_id].path); + } + + // search wal_dir if user uses a customize wal_dir + bool same = false; + Status status = env_->AreFilesSame(db_options_.wal_dir, dbname_, &same); + if (status.IsNotSupported()) { + same = db_options_.wal_dir == dbname_; + status = Status::OK(); + } else if (!status.ok()) { + return status; + } + + if (!same) { + to_search_paths.push_back(db_options_.wal_dir); + } + + for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) { + status = env_->GetChildren(to_search_paths[path_id], &filenames); + if (!status.ok()) { + return status; + } + if (!filenames.empty()) { + found_file = true; + } + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type)) { + if (type == kDescriptorFile) { + manifests_.push_back(filenames[i]); + } else { + if (number + 1 > next_file_number_) { + next_file_number_ = number + 1; + } + if (type == kLogFile) { + logs_.push_back(number); + } else if (type == kTableFile) { + table_fds_.emplace_back(number, static_cast(path_id), + 0); + } else { + // Ignore other files + } + } + } + } + } + if (!found_file) { + return Status::Corruption(dbname_, "repair found no files"); + } + return Status::OK(); + } + + void ConvertLogFilesToTables() { + for (size_t i = 0; i < logs_.size(); i++) { + // we should use LogFileName(wal_dir, logs_[i]) here. user might uses wal_dir option. + std::string logname = LogFileName(db_options_.wal_dir, logs_[i]); + Status status = ConvertLogToTable(logs_[i]); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Log #%" PRIu64 ": ignoring conversion error: %s", + logs_[i], status.ToString().c_str()); + } + ArchiveFile(logname); + } + } + + Status ConvertLogToTable(uint64_t log) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + std::shared_ptr info_log; + uint64_t lognum; + void Corruption(size_t bytes, const Status& s) override { + // We print error messages for corruption, but continue repairing. 
+
+  void ConvertLogFilesToTables() {
+    for (size_t i = 0; i < logs_.size(); i++) {
+      // We use LogFileName(wal_dir, logs_[i]) here, since the user might use
+      // the wal_dir option.
+      std::string logname = LogFileName(db_options_.wal_dir, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Log #%" PRIu64 ": ignoring conversion error: %s",
+                       logs_[i], status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      std::shared_ptr<Logger> info_log;
+      uint64_t lognum;
+      void Corruption(size_t bytes, const Status& s) override {
+        // We print error messages for corruption, but continue repairing.
+        ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
+                        lognum, static_cast<int>(bytes), s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(db_options_.wal_dir, log);
+    std::unique_ptr<SequentialFile> lfile;
+    Status status = env_->NewSequentialFile(
+        logname, &lfile, env_->OptimizeForLogRead(env_options_));
+    if (!status.ok()) {
+      return status;
+    }
+    std::unique_ptr<SequentialFileReader> lfile_reader(new SequentialFileReader(
+        NewLegacySequentialFileWrapper(lfile), logname));
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = db_options_.info_log;
+    reporter.lognum = log;
+    // We intentionally make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
+                       true /*enable checksum*/, log);
+
+    // Initialize per-column family memtables
+    for (auto* cfd : *vset_.GetColumnFamilySet()) {
+      cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                             kMaxSequenceNumber);
+    }
+    auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status =
+          WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
+                       log, status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+
+    // Dump a table for each column family with entries in this log file.
+    for (auto* cfd : *vset_.GetColumnFamilySet()) {
+      // Do not record a version edit for this conversion to a Table
+      // since ExtractMetaData() will also generate edits.
+      MemTable* mem = cfd->mem();
+      if (mem->IsEmpty()) {
+        continue;
+      }
+
+      FileMetaData meta;
+      meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+      ReadOptions ro;
+      ro.total_order_seek = true;
+      Arena arena;
+      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+      int64_t _current_time = 0;
+      status = env_->GetCurrentTime(&_current_time);  // ignore error
+      const uint64_t current_time = static_cast<uint64_t>(_current_time);
+      SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
+
+      auto write_hint = cfd->CalculateSSTWriteHint(0);
+      std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+          range_del_iters;
+      auto range_del_iter =
+          mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+      if (range_del_iter != nullptr) {
+        range_del_iters.emplace_back(range_del_iter);
+      }
+
+      LegacyFileSystemWrapper fs(env_);
+      status = BuildTable(
+          dbname_, env_, &fs, *cfd->ioptions(),
+          *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_,
+          iter.get(), std::move(range_del_iters), &meta,
+          cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+          cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber,
+          snapshot_checker, kNoCompression, 0 /* sample_for_compression */,
+          CompressionOptions(), false, nullptr /* internal_stats */,
+          TableFileCreationReason::kRecovery, nullptr /* event_logger */,
+          0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */,
+          -1 /* level */, current_time, write_hint);
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+                     log, counter, meta.fd.GetNumber(),
+                     status.ToString().c_str());
+      if (status.ok()) {
+        if (meta.fd.GetFileSize() > 0) {
+          table_fds_.push_back(meta.fd);
+        }
+      } else {
+        break;
+      }
+    }
+    delete cf_mems;
+    return status;
+  }
+
+  void ExtractMetaData() {
+    for (size_t i = 0; i < table_fds_.size(); i++) {
+      TableInfo t;
+      t.meta.fd = table_fds_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(
+            db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
+        char file_num_buf[kFormatFileNumberBufSize];
+        FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+                         file_num_buf, sizeof(file_num_buf));
+        ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
+                       file_num_buf, status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(
+        db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
+    int counter = 0;
+    uint64_t file_size;
+    Status status = env_->GetFileSize(fname, &file_size);
+    t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
+                                file_size);
+    std::shared_ptr<const TableProperties> props;
+    if (status.ok()) {
+      status = table_cache_->GetTableProperties(env_options_, icmp_, t->meta.fd,
+                                                &props);
+    }
+    if (status.ok()) {
+      t->column_family_id = static_cast<uint32_t>(props->column_family_id);
+      if (t->column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Table #%" PRIu64
+            ": column family unknown (probably due to legacy format); "
+            "adding to default column family id 0.",
+            t->meta.fd.GetNumber());
+        t->column_family_id = 0;
+      }
+
+      if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
+          nullptr) {
+        status =
+            AddColumnFamily(props->column_family_name, t->column_family_id);
+      }
+      t->meta.oldest_ancester_time = props->creation_time;
+    }
+    ColumnFamilyData* cfd = nullptr;
+    if (status.ok()) {
+      cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
+      if (cfd->GetName() != props->column_family_name) {
+        ROCKS_LOG_ERROR(
+            db_options_.info_log,
+            "Table #%" PRIu64
+            ": inconsistent column family name '%s'; expected '%s' for column "
+            "family id %" PRIu32 ".",
+            t->meta.fd.GetNumber(), props->column_family_name.c_str(),
+            cfd->GetName().c_str(), t->column_family_id);
+        status = Status::Corruption(dbname_, "inconsistent column family name");
+      }
+    }
+    if (status.ok()) {
+      ReadOptions ropts;
+      ropts.total_order_seek = true;
+      InternalIterator* iter = table_cache_->NewIterator(
+          ropts, env_options_, cfd->internal_comparator(), t->meta,
+          nullptr /* range_del_agg */,
+          cfd->GetLatestMutableCFOptions()->prefix_extractor.get(),
+          /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+          TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
+          /*level=*/-1, /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr);
+      ParsedInternalKey parsed;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          ROCKS_LOG_ERROR(db_options_.info_log,
+                          "Table #%" PRIu64 ": unparsable key %s",
+                          t->meta.fd.GetNumber(), EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+
+        t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
+                                 parsed.type);
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+
+      ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+                     t->meta.fd.GetNumber(), counter,
+                     status.ToString().c_str());
+    }
+    return status;
+  }
+
+  Status AddTables() {
+    std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
+    SequenceNumber max_sequence = 0;
+    for (size_t i = 0; i < tables_.size(); i++) {
+      cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
+      if (max_sequence < tables_[i].meta.fd.largest_seqno) {
+        max_sequence = tables_[i].meta.fd.largest_seqno;
+      }
+    }
+    vset_.SetLastAllocatedSequence(max_sequence);
+    vset_.SetLastPublishedSequence(max_sequence);
+    vset_.SetLastSequence(max_sequence);
+
+    for (const auto& cf_id_and_tables : cf_id_to_tables) {
+      auto* cfd =
+          vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
+      VersionEdit edit;
+      edit.SetComparatorName(cfd->user_comparator()->Name());
+      edit.SetLogNumber(0);
+      edit.SetNextFile(next_file_number_);
+      edit.SetColumnFamily(cfd->GetID());
+
+      // TODO(opt): separate out into multiple levels
+      for (const auto* table : cf_id_and_tables.second) {
+        edit.AddFile(
+            0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
+            table->meta.fd.GetFileSize(), table->meta.smallest,
+            table->meta.largest, table->meta.fd.smallest_seqno,
+            table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
+            table->meta.oldest_blob_file_number,
+            table->meta.oldest_ancester_time, table->meta.file_creation_time,
+            table->meta.file_checksum, table->meta.file_checksum_func_name);
+      }
+      assert(next_file_number_ > 0);
+      vset_.MarkFileNumberUsed(next_file_number_ - 1);
+      mutex_.Lock();
+      Status status = vset_.LogAndApply(
+          cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
+          nullptr /* db_directory */, false /* new_descriptor_log */);
+      mutex_.Unlock();
+      if (!status.ok()) {
+        return status;
+      }
+    }
+    return Status::OK();
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory. E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != nullptr) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+                   s.ToString().c_str());
+  }
+};
+
+Status GetDefaultCFOptions(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    ColumnFamilyOptions* res) {
+  assert(res != nullptr);
+  auto iter = std::find_if(column_families.begin(), column_families.end(),
+                           [](const ColumnFamilyDescriptor& cfd) {
+                             return cfd.name == kDefaultColumnFamilyName;
+                           });
+  if (iter == column_families.end()) {
+    return Status::InvalidArgument(
+        "column_families", "Must contain entry for default column family");
+  }
+  *res = iter->options;
+  return Status::OK();
+}
+}  // anonymous namespace
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families) {
+  ColumnFamilyOptions default_cf_opts;
+  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+  if (status.ok()) {
+    Repairer repairer(dbname, db_options, column_families,
+                      default_cf_opts,
+                      ColumnFamilyOptions() /* unknown_cf_opts */,
+                      false /* create_unknown_cfs */);
+    status = repairer.Run();
+  }
+  return status;
+}
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                const ColumnFamilyOptions& unknown_cf_opts) {
+  ColumnFamilyOptions default_cf_opts;
+  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+  if (status.ok()) {
+    Repairer repairer(dbname, db_options,
+                      column_families, default_cf_opts,
+                      unknown_cf_opts, true /* create_unknown_cfs */);
+    status = repairer.Run();
+  }
+  return status;
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Options opts(options);
+  if (opts.file_system == nullptr) {
+    opts.file_system.reset(new LegacyFileSystemWrapper(opts.env));
+  }
+
+  DBOptions db_options(opts);
+  ColumnFamilyOptions cf_options(opts);
+  Repairer repairer(dbname, db_options,
+                    {}, cf_options /* default_cf_opts */,
+                    cf_options /* unknown_cf_opts */,
+                    true /* create_unknown_cfs */);
+  return repairer.Run();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
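A minimal caller of the single-options convenience overload defined above; the database path and option values are placeholders:

#include <cstdio>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  ROCKSDB_NAMESPACE::Options options;

  // RepairDB() archives the old MANIFEST files under <dbname>/lost and
  // writes a fresh one describing every table file it could recover.
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::RepairDB("/tmp/exampledb", options);
  if (!s.ok()) {
    std::fprintf(stderr, "RepairDB failed: %s\n", s.ToString().c_str());
    return 1;
  }
  return 0;
}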
diff --git a/src/rocksdb/db/repair_test.cc b/src/rocksdb/db/repair_test.cc
new file mode 100644
index 000000000..ba2bae3d0
--- /dev/null
+++ b/src/rocksdb/db/repair_test.cc
@@ -0,0 +1,369 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/transaction_log.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class RepairTest : public DBTestBase {
+ public:
+  RepairTest() : DBTestBase("/repair_test") {}
+
+  std::string GetFirstSstPath() {
+    uint64_t manifest_size;
+    std::vector<std::string> files;
+    db_->GetLiveFiles(files, &manifest_size);
+    auto sst_iter =
+        std::find_if(files.begin(), files.end(), [](const std::string& file) {
+          uint64_t number;
+          FileType type;
+          bool ok = ParseFileName(file, &number, &type);
+          return ok && type == kTableFile;
+        });
+    return sst_iter == files.end() ? "" : dbname_ + *sst_iter;
+  }
+};
+
+TEST_F(RepairTest, LostManifest) {
+  // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+  // the day.
+  Put("key", "val");
+  Flush();
+  Put("key2", "val2");
+  Flush();
+  // Need to get the path before Close() deletes db_, but delete it after
+  // Close() to ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  Reopen(CurrentOptions());
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, CorruptManifest) {
+  // Manifest is in an invalid format. Expect a full recovery.
+  Put("key", "val");
+  Flush();
+  Put("key2", "val2");
+  Flush();
+  // Need to get the path before Close() deletes db_, but overwrite it after
+  // Close() to ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+
+  LegacyFileSystemWrapper fs(env_);
+  CreateFile(&fs, manifest_path, "blah", false /* use_fsync */);
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  Reopen(CurrentOptions());
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, IncompleteManifest) {
+  // In this case, the manifest is valid but does not reference all of the SST
+  // files. Expect a full recovery.
+  Put("key", "val");
+  Flush();
+  std::string orig_manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+  CopyFile(orig_manifest_path, orig_manifest_path + ".tmp");
+  Put("key2", "val2");
+  Flush();
+  // Need to get the path before Close() deletes db_, but overwrite it after
+  // Close() to ensure Close() didn't change the manifest.
+  std::string new_manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(new_manifest_path));
+  // Replace the manifest with one that is only aware of the first SST file.
+  CopyFile(orig_manifest_path + ".tmp", new_manifest_path);
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  Reopen(CurrentOptions());
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, PostRepairSstFileNumbering) {
+  // Verify that, after a DB is repaired, new files are assigned numbers
+  // higher than old files.
+ Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); + Close(); + + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + + Reopen(CurrentOptions()); + uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); + ASSERT_GE(post_repair_file_num, pre_repair_file_num); +} + +TEST_F(RepairTest, LostSst) { + // Delete one of the SST files but preserve the manifest that refers to it, + // then verify the DB is still usable for the intact SST. + Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + auto sst_path = GetFirstSstPath(); + ASSERT_FALSE(sst_path.empty()); + ASSERT_OK(env_->DeleteFile(sst_path)); + + Close(); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + // Exactly one of the key-value pairs should be in the DB now. + ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2")); +} + +TEST_F(RepairTest, CorruptSst) { + // Corrupt one of the SST files but preserve the manifest that refers to it, + // then verify the DB is still usable for the intact SST. + Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + auto sst_path = GetFirstSstPath(); + ASSERT_FALSE(sst_path.empty()); + + LegacyFileSystemWrapper fs(env_); + CreateFile(&fs, sst_path, "blah", false /* use_fsync */); + + Close(); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + // Exactly one of the key-value pairs should be in the DB now. + ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2")); +} + +TEST_F(RepairTest, UnflushedSst) { + // This test case invokes repair while some data is unflushed, then verifies + // that data is in the db. + Put("key", "val"); + VectorLogPtr wal_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 1); + uint64_t total_ssts_size; + GetAllSSTFiles(&total_ssts_size); + ASSERT_EQ(total_ssts_size, 0); + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() didn't change the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 0); + GetAllSSTFiles(&total_ssts_size); + ASSERT_GT(total_ssts_size, 0); + ASSERT_EQ(Get("key"), "val"); +} + +TEST_F(RepairTest, SeparateWalDir) { + do { + Options options = CurrentOptions(); + DestroyAndReopen(options); + Put("key", "val"); + Put("foo", "bar"); + VectorLogPtr wal_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 1); + uint64_t total_ssts_size; + GetAllSSTFiles(&total_ssts_size); + ASSERT_EQ(total_ssts_size, 0); + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, options)); + + // make sure that all WALs are converted to SSTables. 
+ options.wal_dir = ""; + + Reopen(options); + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 0); + GetAllSSTFiles(&total_ssts_size); + ASSERT_GT(total_ssts_size, 0); + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("foo"), "bar"); + + } while(ChangeWalOptions()); +} + +TEST_F(RepairTest, RepairMultipleColumnFamilies) { + // Verify repair logic associates SST files with their original column + // families. + const int kNumCfs = 3; + const int kEntriesPerCf = 2; + DestroyAndReopen(CurrentOptions()); + CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions()); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + Put(i, "key" + ToString(j), "val" + ToString(j)); + if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) { + // Leave one unflushed so we can verify WAL entries are properly + // associated with column families. + continue; + } + Flush(i); + } + } + + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() doesn't re-create the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + + ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"}, + CurrentOptions()); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j)); + } + } +} + +TEST_F(RepairTest, RepairColumnFamilyOptions) { + // Verify repair logic uses correct ColumnFamilyOptions when repairing a + // database with different options for column families. + const int kNumCfs = 2; + const int kEntriesPerCf = 2; + + Options opts(CurrentOptions()), rev_opts(CurrentOptions()); + opts.comparator = BytewiseComparator(); + rev_opts.comparator = ReverseBytewiseComparator(); + + DestroyAndReopen(opts); + CreateColumnFamilies({"reverse"}, rev_opts); + ReopenWithColumnFamilies({"default", "reverse"}, + std::vector{opts, rev_opts}); + for (int i = 0; i < kNumCfs; ++i) { + for (int j = 0; j < kEntriesPerCf; ++j) { + Put(i, "key" + ToString(j), "val" + ToString(j)); + if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) { + // Leave one unflushed so we can verify RepairDB's flush logic + continue; + } + Flush(i); + } + } + Close(); + + // RepairDB() records the comparator in the manifest, and DB::Open would fail + // if a different comparator were used. 
+  ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}},
+                     opts /* unknown_cf_opts */));
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+                                        std::vector<Options>{opts, rev_opts}));
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+    }
+  }
+
+  // Examine table properties to verify RepairDB() used the right options when
+  // converting WAL->SST
+  TablePropertiesCollection fname_to_props;
+  db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props);
+  ASSERT_EQ(fname_to_props.size(), 2U);
+  for (const auto& fname_and_props : fname_to_props) {
+    std::string comparator_name(
+        InternalKeyComparator(rev_opts.comparator).Name());
+    comparator_name = comparator_name.substr(comparator_name.find(':') + 1);
+    ASSERT_EQ(comparator_name,
+              fname_and_props.second->comparator_name);
+  }
+  Close();
+
+  // Also check the comparator when it's provided via "unknown" CF options
+  ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}},
+                     rev_opts /* unknown_cf_opts */));
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+                                        std::vector<Options>{opts, rev_opts}));
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + ToString(j)), "val" + ToString(j));
+    }
+  }
+}
+
+TEST_F(RepairTest, DbNameContainsTrailingSlash) {
+  {
+    bool tmp;
+    if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+      fprintf(stderr,
+              "skipping RepairTest.DbNameContainsTrailingSlash due to "
+              "unsupported Env::AreFilesSame\n");
+      return;
+    }
+  }
+
+  Put("key", "val");
+  Flush();
+  Close();
+
+  ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
+  Reopen(CurrentOptions());
+  ASSERT_EQ(Get("key"), "val");
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/snapshot_checker.h b/src/rocksdb/db/snapshot_checker.h
new file mode 100644
index 000000000..1d2c2c316
--- /dev/null
+++ b/src/rocksdb/db/snapshot_checker.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class SnapshotCheckerResult : int {
+  kInSnapshot = 0,
+  kNotInSnapshot = 1,
+  // In case the snapshot is released and the checker has no clue whether
+  // the given sequence is visible to the snapshot.
+  kSnapshotReleased = 2,
+};
+
+// Callback class that controls GC of duplicate keys in flush/compaction.
+class SnapshotChecker {
+ public:
+  virtual ~SnapshotChecker() {}
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0;
+};
+
+class DisableGCSnapshotChecker : public SnapshotChecker {
+ public:
+  virtual ~DisableGCSnapshotChecker() {}
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber /*sequence*/,
+      SequenceNumber /*snapshot_sequence*/) const override {
+    // By returning kNotInSnapshot, we prevent all the values from being GCed
+    return SnapshotCheckerResult::kNotInSnapshot;
+  }
+  static DisableGCSnapshotChecker* Instance() { return &instance_; }
+
+ protected:
+  static DisableGCSnapshotChecker instance_;
+  explicit DisableGCSnapshotChecker() {}
+};
+
+class WritePreparedTxnDB;
+
+// Callback class created by WritePreparedTxnDB to check if a key
+// is visible by a snapshot.
+class WritePreparedSnapshotChecker : public SnapshotChecker {
+ public:
+  explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
+  virtual ~WritePreparedSnapshotChecker() {}
+
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber sequence, SequenceNumber snapshot_sequence) const override;
+
+ private:
+#ifndef ROCKSDB_LITE
+  const WritePreparedTxnDB* const txn_db_;
+#endif  // !ROCKSDB_LITE
+};
+
+}  // namespace ROCKSDB_NAMESPACE
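To illustrate the CheckInSnapshot() contract, here is a hypothetical checker backed by a prepare-seq to commit-seq map; CommitTableSnapshotChecker is an invented name, and the real WritePreparedSnapshotChecker answers the same question from WritePreparedTxnDB state instead:

#include <map>

#include "db/snapshot_checker.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical checker: a sequence number is in a snapshot iff it was
// committed with a sequence number at or below the snapshot's.
class CommitTableSnapshotChecker : public SnapshotChecker {
 public:
  explicit CommitTableSnapshotChecker(
      std::map<SequenceNumber, SequenceNumber> commit_table)
      : commit_table_(std::move(commit_table)) {}

  SnapshotCheckerResult CheckInSnapshot(
      SequenceNumber sequence,
      SequenceNumber snapshot_sequence) const override {
    auto it = commit_table_.find(sequence);
    if (it == commit_table_.end()) {
      return SnapshotCheckerResult::kNotInSnapshot;  // never committed
    }
    return it->second <= snapshot_sequence
               ? SnapshotCheckerResult::kInSnapshot
               : SnapshotCheckerResult::kNotInSnapshot;
  }

 private:
  const std::map<SequenceNumber, SequenceNumber> commit_table_;
};

}  // namespace ROCKSDB_NAMESPACE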
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 000000000..b9228c797
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/snapshot.h"
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ManagedSnapshot::ManagedSnapshot(DB* db)
+    : db_(db), snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot)
+    : db_(db), snapshot_(_snapshot) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+  if (snapshot_) {
+    db_->ReleaseSnapshot(snapshot_);
+  }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 000000000..785f814f8
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,167 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+  // It indicates the smallest uncommitted data at the time the snapshot was
+  // taken. This is currently used by WritePrepared transactions to limit the
+  // scope of queries to IsInSnapshot.
+  SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+
+  virtual SequenceNumber GetSequenceNumber() const override { return number_; }
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;  // just for sanity checks
+
+  int64_t unix_time_;
+
+  // Will this snapshot be used by a Transaction to do write-conflict checking?
+  bool is_write_conflict_boundary_;
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;  // placeholder marker, for debugging
+    // Set all the variables to make UBSAN happy.
+    list_.list_ = nullptr;
+    list_.unix_time_ = 0;
+    list_.is_write_conflict_boundary_ = false;
+    count_ = 0;
+  }
+
+  // No copy-construct.
+  SnapshotList(const SnapshotList&) = delete;
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time,
+                    bool is_write_conflict_boundary) {
+    s->number_ = seq;
+    s->unix_time_ = unix_time;
+    s->is_write_conflict_boundary_ = is_write_conflict_boundary;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    count_++;
+    return s;
+  }
+
+  // Unlinks the snapshot from the list; it is not responsible for freeing
+  // the object.
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    count_--;
+  }
+
+  // Retrieve all snapshot numbers up until max_seq. They are sorted in
+  // ascending order (with no duplicates).
+  std::vector<SequenceNumber> GetAll(
+      SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+      const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+    std::vector<SequenceNumber> ret;
+    GetAll(&ret, oldest_write_conflict_snapshot, max_seq);
+    return ret;
+  }
+
+  void GetAll(std::vector<SequenceNumber>* snap_vector,
+              SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+              const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+    std::vector<SequenceNumber>& ret = *snap_vector;
+    // So far we have no use case that would pass a non-empty vector
+    assert(ret.size() == 0);
+
+    if (oldest_write_conflict_snapshot != nullptr) {
+      *oldest_write_conflict_snapshot = kMaxSequenceNumber;
+    }
+
+    if (empty()) {
+      return;
+    }
+    const SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      if (s->next_->number_ > max_seq) {
+        break;
+      }
+      // Avoid duplicates
+      if (ret.empty() || ret.back() != s->next_->number_) {
+        ret.push_back(s->next_->number_);
+      }
+
+      if (oldest_write_conflict_snapshot != nullptr &&
+          *oldest_write_conflict_snapshot == kMaxSequenceNumber &&
+          s->next_->is_write_conflict_boundary_) {
+        // If this is the first write-conflict boundary snapshot in the list,
+        // it is the oldest
+        *oldest_write_conflict_snapshot = s->next_->number_;
+      }
+
+      s = s->next_;
+    }
+    return;
+  }
+
+  // Get the sequence number of the most recent snapshot
+  SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+  int64_t GetOldestSnapshotTime() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->unix_time_;
+    }
+  }
+
+  int64_t GetOldestSnapshotSequence() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->GetSequenceNumber();
+    }
+  }
+
+  uint64_t count() const { return count_; }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+  uint64_t count_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
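A sketch of how the SnapshotList API above fits together: New() links a caller-allocated SnapshotImpl into the circular list, GetAll() returns the sorted sequence numbers, and Delete() only unlinks. In the real DB these calls happen under the DB mutex, and the sequence and time values here are made up:

#include <cassert>
#include <vector>

#include "db/snapshot_impl.h"

namespace ROCKSDB_NAMESPACE {

void SnapshotListSketch() {
  SnapshotList snapshots;

  SnapshotImpl* s1 = snapshots.New(new SnapshotImpl, /*seq=*/10,
                                   /*unix_time=*/1000,
                                   /*is_write_conflict_boundary=*/false);
  SnapshotImpl* s2 = snapshots.New(new SnapshotImpl, /*seq=*/25,
                                   /*unix_time=*/1001,
                                   /*is_write_conflict_boundary=*/true);

  // Ascending, deduplicated sequence numbers at or below max_seq.
  std::vector<SequenceNumber> seqs = snapshots.GetAll();
  assert(seqs.size() == 2 && seqs[0] == 10 && seqs[1] == 25);

  // Delete() only unlinks; the caller still owns the object.
  snapshots.Delete(s1);
  delete s1;
  snapshots.Delete(s2);
  delete s2;
}

}  // namespace ROCKSDB_NAMESPACE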
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
new file mode 100644
index 000000000..411959a33
--- /dev/null
+++ b/src/rocksdb/db/table_cache.cc
@@ -0,0 +1,668 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+template <class T>
+static void DeleteEntry(const Slice& /*key*/, void* value) {
+  T* typed_value = reinterpret_cast<T*>(value);
+  delete typed_value;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
+  return Slice(reinterpret_cast<const char*>(file_number),
+               sizeof(*file_number));
+}
+
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+  char buf[10];
+  auto ptr = EncodeVarint64(buf, v);
+  key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif  // ROCKSDB_LITE
+
+} // namespace
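+
+// A minimal sketch of what GetSliceForFileNumber produces (hypothetical
+// values; byte order follows the host, which is fine because the key is
+// only compared for equality within one process):
+//
+//   uint64_t number = 42;
+//   Slice key = GetSliceForFileNumber(&number);  // 8 raw bytes of `number`
+//   // cache_->Lookup(key) then keys the cached TableReader by file number.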
+
+TableCache::TableCache(const ImmutableCFOptions& ioptions,
+                       const FileOptions& file_options, Cache* const cache,
+                       BlockCacheTracer* const block_cache_tracer)
+    : ioptions_(ioptions),
+      file_options_(file_options),
+      cache_(cache),
+      immortal_tables_(false),
+      block_cache_tracer_(block_cache_tracer) {
+  if (ioptions_.row_cache) {
+    // If the same cache is shared by multiple instances, we need to
+    // disambiguate its entries.
+    PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+  }
+}
+
+TableCache::~TableCache() {
+}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+  return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+  cache_->Release(handle);
+}
+
+Status TableCache::GetTableReader(
+    const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+    bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist,
+    std::unique_ptr<TableReader>* table_reader,
+    const SliceTransform* prefix_extractor, bool skip_filters, int level,
+    bool prefetch_index_and_filter_in_cache) {
+  std::string fname =
+      TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId());
+  std::unique_ptr<FSRandomAccessFile> file;
+  Status s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
+                                               nullptr);
+  RecordTick(ioptions_.statistics, NO_FILE_OPENS);
+  if (s.IsPathNotFound()) {
+    fname = Rocks2LevelTableFileName(fname);
+    s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, nullptr);
+    RecordTick(ioptions_.statistics, NO_FILE_OPENS);
+  }
+
+  if (s.ok()) {
+    if (!sequential_mode && ioptions_.advise_random_on_open) {
+      file->Hint(FSRandomAccessFile::kRandom);
+    }
+    StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(
+            std::move(file), fname, ioptions_.env,
+            record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS,
+            file_read_hist, ioptions_.rate_limiter, ioptions_.listeners));
+    s = ioptions_.table_factory->NewTableReader(
+        TableReaderOptions(ioptions_, prefix_extractor, file_options,
+                           internal_comparator, skip_filters, immortal_tables_,
+                           level, fd.largest_seqno, block_cache_tracer_),
+        std::move(file_reader), fd.GetFileSize(), table_reader,
+        prefetch_index_and_filter_in_cache);
+    TEST_SYNC_POINT("TableCache::GetTableReader:0");
+  }
+  return s;
+}
+
+void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
+  ReleaseHandle(handle);
+  uint64_t number = fd.GetNumber();
+  Slice key = GetSliceForFileNumber(&number);
+  cache_->Erase(key);
+}
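+
+// FindTable below implements the usual cache-aside pattern: look the file
+// number up in `cache_`, and only on a miss open the file and insert the new
+// TableReader. A sketch of the caller-side contract (hypothetical use):
+//
+//   Cache::Handle* handle = nullptr;
+//   Status s = table_cache->FindTable(file_options, icmp, fd, &handle);
+//   if (s.ok()) {
+//     TableReader* t = table_cache->GetTableReaderFromHandle(handle);
+//     ...  // t stays pinned while the handle is held
+//     table_cache->ReleaseHandle(handle);
+//   }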
+
+Status TableCache::FindTable(const FileOptions& file_options,
+                             const InternalKeyComparator& internal_comparator,
+                             const FileDescriptor& fd, Cache::Handle** handle,
+                             const SliceTransform* prefix_extractor,
+                             const bool no_io, bool record_read_stats,
+                             HistogramImpl* file_read_hist, bool skip_filters,
+                             int level,
+                             bool prefetch_index_and_filter_in_cache) {
+  PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env);
+  Status s;
+  uint64_t number = fd.GetNumber();
+  Slice key = GetSliceForFileNumber(&number);
+  *handle = cache_->Lookup(key);
+  TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+                           const_cast<bool*>(&no_io));
+
+  if (*handle == nullptr) {
+    if (no_io) {  // Don't do IO and return a not-found status
+      return Status::Incomplete("Table not found in table_cache, no_io is set");
+    }
+    std::unique_ptr<TableReader> table_reader;
+    s = GetTableReader(file_options, internal_comparator, fd,
+                       false /* sequential mode */, record_read_stats,
+                       file_read_hist, &table_reader, prefix_extractor,
+                       skip_filters, level, prefetch_index_and_filter_in_cache);
+    if (!s.ok()) {
+      assert(table_reader == nullptr);
+      RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+    } else {
+      s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry<TableReader>,
+                         handle);
+      if (s.ok()) {
+        // Release ownership of table reader.
+        table_reader.release();
+      }
+    }
+  }
+  return s;
+}
+
+InternalIterator* TableCache::NewIterator(
+    const ReadOptions& options, const FileOptions& file_options,
+    const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
+    RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
+    TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+    TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+    const InternalKey* smallest_compaction_key,
+    const InternalKey* largest_compaction_key) {
+  PERF_TIMER_GUARD(new_table_iterator_nanos);
+
+  Status s;
+  TableReader* table_reader = nullptr;
+  Cache::Handle* handle = nullptr;
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = nullptr;
+  }
+  bool for_compaction = caller == TableReaderCaller::kCompaction;
+  auto& fd = file_meta.fd;
+  table_reader = fd.table_reader;
+  if (table_reader == nullptr) {
+    s = FindTable(file_options, icomparator, fd, &handle, prefix_extractor,
+                  options.read_tier == kBlockCacheTier /* no_io */,
+                  !for_compaction /* record_read_stats */, file_read_hist,
+                  skip_filters, level);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(handle);
+    }
+  }
+  InternalIterator* result = nullptr;
+  if (s.ok()) {
+    if (options.table_filter &&
+        !options.table_filter(*table_reader->GetTableProperties())) {
+      result = NewEmptyInternalIterator<Slice>(arena);
+    } else {
+      result = table_reader->NewIterator(options, prefix_extractor, arena,
+                                         skip_filters, caller,
+                                         file_options.compaction_readahead_size);
+    }
+    if (handle != nullptr) {
+      result->RegisterCleanup(&UnrefEntry, cache_, handle);
+      handle = nullptr;  // prevent from releasing below
+    }
+
+    if (for_compaction) {
+      table_reader->SetupForCompaction();
+    }
+    if (table_reader_ptr != nullptr) {
+      *table_reader_ptr = table_reader;
+    }
+  }
+  if (s.ok() && range_del_agg != nullptr && !options.ignore_range_deletions) {
+    if (range_del_agg->AddFile(fd.GetNumber())) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          static_cast<FragmentedRangeTombstoneIterator*>(
+              table_reader->NewRangeTombstoneIterator(options)));
+      if (range_del_iter != nullptr) {
+        s = range_del_iter->status();
+      }
+      if (s.ok()) {
+        const InternalKey* smallest = &file_meta.smallest;
+        const InternalKey* largest = &file_meta.largest;
+        if (smallest_compaction_key != nullptr) {
+          smallest = smallest_compaction_key;
+        }
+        if (largest_compaction_key != nullptr) {
+          largest = largest_compaction_key;
+        }
+        range_del_agg->AddTombstones(std::move(range_del_iter), smallest,
+                                     largest);
+      }
+    }
+  }
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  if (!s.ok()) {
+    assert(result == nullptr);
+    result = NewErrorInternalIterator<Slice>(s, arena);
+  }
+  return result;
+}
+
+Status TableCache::GetRangeTombstoneIterator(
+    const ReadOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
+  const FileDescriptor& fd = file_meta.fd;
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (t == nullptr) {
+    s = FindTable(file_options_, internal_comparator, fd, &handle);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+  }
+  if (s.ok()) {
+    out_iter->reset(t->NewRangeTombstoneIterator(options));
+    assert(out_iter);
+  }
+  return s;
+}
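+
+// Note on handle lifetime in NewIterator above: once the cache handle is
+// handed to RegisterCleanup, releasing it is deferred to the iterator's
+// destruction. A sketch of the effect (hypothetical caller):
+//
+//   InternalIterator* it = table_cache->NewIterator(...);
+//   ...          // the TableReader stays pinned while `it` is alive
+//   delete it;   // UnrefEntry then releases the cache handle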
+#ifndef ROCKSDB_LITE
+void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
+                                         const FileDescriptor& fd,
+                                         const Slice& internal_key,
+                                         GetContext* get_context,
+                                         IterKey& row_cache_key) {
+  uint64_t fd_number = fd.GetNumber();
+  // We use the user key as cache key instead of the internal key,
+  // otherwise the whole cache would be invalidated every time the
+  // sequence key increases. However, to support caching snapshot
+  // reads, we append the sequence number (incremented by 1 to
+  // distinguish from 0) only in this case.
+  // If the snapshot is larger than the largest seqno in the file,
+  // all data should be exposed to the snapshot, so we treat it
+  // the same as there is no snapshot. The exception is that if
+  // a seq-checking callback is registered, some internal keys
+  // may still be filtered out.
+  uint64_t seq_no = 0;
+  // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+  if (options.snapshot != nullptr &&
+      (get_context->has_callback() ||
+       static_cast_with_check<const SnapshotImpl, const Snapshot>(
+           options.snapshot)
+               ->GetSequenceNumber() <= fd.largest_seqno)) {
+    // We should consider using options.snapshot->GetSequenceNumber()
+    // instead of GetInternalKeySeqno(k), which would make the code
+    // easier to understand.
+    seq_no = 1 + GetInternalKeySeqno(internal_key);
+  }
+
+  // Compute row cache key.
+  row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+                           row_cache_id_.size());
+  AppendVarint64(&row_cache_key, fd_number);
+  AppendVarint64(&row_cache_key, seq_no);
+}
+
+bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+                                 size_t prefix_size, GetContext* get_context) {
+  bool found = false;
+
+  row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
+  if (auto row_handle =
+          ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) {
+    // Cleanable routine to release the cache entry
+    Cleanable value_pinner;
+    auto release_cache_entry_func = [](void* cache_to_clean,
+                                       void* cache_handle) {
+      ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle);
+    };
+    auto found_row_cache_entry =
+        static_cast<const std::string*>(ioptions_.row_cache->Value(row_handle));
+    // If we reach here, the value is located in the cache.
+    // found_row_cache_entry points to the value on cache,
+    // and value_pinner has cleanup procedure for the cached entry.
+    // After replayGetContextLog() returns, get_context.pinnable_slice_
+    // will point to cache entry buffer (or a copy based on that) and
+    // cleanup routine under value_pinner will be delegated to
+    // get_context.pinnable_slice_. Cache entry is released when
+    // get_context.pinnable_slice_ is reset.
+    value_pinner.RegisterCleanup(release_cache_entry_func,
+                                 ioptions_.row_cache.get(), row_handle);
+    replayGetContextLog(*found_row_cache_entry, user_key, get_context,
+                        &value_pinner);
+    RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
+    found = true;
+  } else {
+    RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
+  }
+  return found;
+}
+#endif  // ROCKSDB_LITE
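+
+// Layout sketch of the row cache key assembled above (illustrative; the id
+// prefix is raw bytes, the rest varint-encoded):
+//
+//   [row_cache_id_ bytes][varint64 file_number][varint64 seq_no]
+//     ... then GetFromRowCache appends: [user_key bytes]
+//
+// seq_no is 0 for non-snapshot reads and (1 + the internal key's sequence
+// number) for snapshot reads, so the two cases can never collide.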
+
+Status TableCache::Get(const ReadOptions& options,
+                       const InternalKeyComparator& internal_comparator,
+                       const FileMetaData& file_meta, const Slice& k,
+                       GetContext* get_context,
+                       const SliceTransform* prefix_extractor,
+                       HistogramImpl* file_read_hist, bool skip_filters,
+                       int level) {
+  auto& fd = file_meta.fd;
+  std::string* row_cache_entry = nullptr;
+  bool done = false;
+#ifndef ROCKSDB_LITE
+  IterKey row_cache_key;
+  std::string row_cache_entry_buffer;
+
+  // Check row cache if enabled. Since row cache does not currently store
+  // sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+    auto user_key = ExtractUserKey(k);
+    CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
+    done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
+                           get_context);
+    if (!done) {
+      row_cache_entry = &row_cache_entry_buffer;
+    }
+  }
+#endif  // ROCKSDB_LITE
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (!done && s.ok()) {
+    if (t == nullptr) {
+      s = FindTable(
+          file_options_, internal_comparator, fd, &handle, prefix_extractor,
+          options.read_tier == kBlockCacheTier /* no_io */,
+          true /* record_read_stats */, file_read_hist, skip_filters, level);
+      if (s.ok()) {
+        t = GetTableReaderFromHandle(handle);
+      }
+    }
+    SequenceNumber* max_covering_tombstone_seq =
+        get_context->max_covering_tombstone_seq();
+    if (s.ok() && max_covering_tombstone_seq != nullptr &&
+        !options.ignore_range_deletions) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          t->NewRangeTombstoneIterator(options));
+      if (range_del_iter != nullptr) {
+        *max_covering_tombstone_seq = std::max(
+            *max_covering_tombstone_seq,
+            range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k)));
+      }
+    }
+    if (s.ok()) {
+      get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
+      s = t->Get(options, k, get_context, prefix_extractor, skip_filters);
+      get_context->SetReplayLog(nullptr);
+    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+      // Couldn't find Table in cache but treat as kFound if no_io set
+      get_context->MarkKeyMayExist();
+      s = Status::OK();
+      done = true;
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  // Put the replay log in row cache only if something was found.
+  if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+    size_t charge =
+        row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string);
+    void* row_ptr = new std::string(std::move(*row_cache_entry));
+    ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+                                &DeleteEntry<std::string>);
+  }
+#endif  // ROCKSDB_LITE
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  return s;
+}
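+
+// The row cache round trip above, in miniature (names hypothetical): a hit
+// replays the saved GetContext log; a miss records one during the table read
+// and inserts it afterwards.
+//
+//   miss:  get_context->SetReplayLog(&buf); t->Get(...);  // fills buf
+//          row_cache->Insert(key, new std::string(std::move(buf)), charge,
+//                            &DeleteEntry<std::string>);
+//   hit:   replayGetContextLog(cached_entry, user_key, get_context, ...);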
+
+// Batched version of TableCache::MultiGet.
+Status TableCache::MultiGet(const ReadOptions& options,
+                            const InternalKeyComparator& internal_comparator,
+                            const FileMetaData& file_meta,
+                            const MultiGetContext::Range* mget_range,
+                            const SliceTransform* prefix_extractor,
+                            HistogramImpl* file_read_hist, bool skip_filters,
+                            int level) {
+  auto& fd = file_meta.fd;
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  MultiGetRange table_range(*mget_range, mget_range->begin(),
+                            mget_range->end());
+#ifndef ROCKSDB_LITE
+  autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
+  IterKey row_cache_key;
+  size_t row_cache_key_prefix_size = 0;
+  KeyContext& first_key = *table_range.begin();
+  bool lookup_row_cache =
+      ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
+
+  // Check row cache if enabled. Since row cache does not currently store
+  // sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (lookup_row_cache) {
+    GetContext* first_context = first_key.get_context;
+    CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
+                            row_cache_key);
+    row_cache_key_prefix_size = row_cache_key.Size();
+
+    for (auto miter = table_range.begin(); miter != table_range.end();
+         ++miter) {
+      const Slice& user_key = miter->ukey;
+      GetContext* get_context = miter->get_context;
+
+      if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+                          get_context)) {
+        table_range.SkipKey(miter);
+      } else {
+        row_cache_entries.emplace_back();
+        get_context->SetReplayLog(&(row_cache_entries.back()));
+      }
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  // Check that table_range is not empty. It's possible all keys were found
+  // in the row cache, in which case the range is now empty.
+  if (s.ok() && !table_range.empty()) {
+    if (t == nullptr) {
+      s = FindTable(
+          file_options_, internal_comparator, fd, &handle, prefix_extractor,
+          options.read_tier == kBlockCacheTier /* no_io */,
+          true /* record_read_stats */, file_read_hist, skip_filters, level);
+      TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
+      if (s.ok()) {
+        t = GetTableReaderFromHandle(handle);
+        assert(t);
+      }
+    }
+    if (s.ok() && !options.ignore_range_deletions) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          t->NewRangeTombstoneIterator(options));
+      if (range_del_iter != nullptr) {
+        for (auto iter = table_range.begin(); iter != table_range.end();
+             ++iter) {
+          SequenceNumber* max_covering_tombstone_seq =
+              iter->get_context->max_covering_tombstone_seq();
+          *max_covering_tombstone_seq =
+              std::max(*max_covering_tombstone_seq,
+                       range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey));
+        }
+      }
+    }
+    if (s.ok()) {
+      t->MultiGet(options, &table_range, prefix_extractor, skip_filters);
+    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+      for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+        Status* status = iter->s;
+        if (status->IsIncomplete()) {
+          // Couldn't find Table in cache but treat as kFound if no_io set
+          iter->get_context->MarkKeyMayExist();
+          s = Status::OK();
+        }
+      }
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  if (lookup_row_cache) {
+    size_t row_idx = 0;
+
+    for (auto miter = table_range.begin(); miter != table_range.end();
+         ++miter) {
+      std::string& row_cache_entry = row_cache_entries[row_idx++];
+      const Slice& user_key = miter->ukey;
+      GetContext* get_context = miter->get_context;
+
+      get_context->SetReplayLog(nullptr);
+      // Compute row cache key.
+      row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
+                               user_key.size());
+      // Put the replay log in row cache only if something was found.
+      if (s.ok() && !row_cache_entry.empty()) {
+        size_t charge =
+            row_cache_key.Size() + row_cache_entry.size() + sizeof(std::string);
+        void* row_ptr = new std::string(std::move(row_cache_entry));
+        ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+                                    &DeleteEntry<std::string>);
+      }
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  return s;
+}
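+
+// Shape of the batched row-cache pass above (illustrative pseudocode): phase
+// one skips every key answered from the row cache, phase two replays the
+// per-key logs collected during the table read back into the cache.
+//
+//   for (k : range)  hit ? range.SkipKey(k) : attach replay buffer to k
+//   t->MultiGet(options, &range, ...)      // fills the replay buffers
+//   for (k : range)  if (!buffer(k).empty()) row_cache->Insert(...)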
+
+Status TableCache::GetTableProperties(
+    const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+    std::shared_ptr<const TableProperties>* properties,
+    const SliceTransform* prefix_extractor, bool no_io) {
+  Status s;
+  auto table_reader = fd.table_reader;
+  // Has the table already been pre-loaded?
+  if (table_reader) {
+    *properties = table_reader->GetTableProperties();
+
+    return s;
+  }
+
+  Cache::Handle* table_handle = nullptr;
+  s = FindTable(file_options, internal_comparator, fd, &table_handle,
+                prefix_extractor, no_io);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  *properties = table->GetTableProperties();
+  ReleaseHandle(table_handle);
+  return s;
+}
+
+size_t TableCache::GetMemoryUsageByTableReader(
+    const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+    const SliceTransform* prefix_extractor) {
+  Status s;
+  auto table_reader = fd.table_reader;
+  // Has the table already been pre-loaded?
+  if (table_reader) {
+    return table_reader->ApproximateMemoryUsage();
+  }
+
+  Cache::Handle* table_handle = nullptr;
+  s = FindTable(file_options, internal_comparator, fd, &table_handle,
+                prefix_extractor, true);
+  if (!s.ok()) {
+    return 0;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  auto ret = table->ApproximateMemoryUsage();
+  ReleaseHandle(table_handle);
+  return ret;
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+  cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+uint64_t TableCache::ApproximateOffsetOf(
+    const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
+    const InternalKeyComparator& internal_comparator,
+    const SliceTransform* prefix_extractor) {
+  uint64_t result = 0;
+  TableReader* table_reader = fd.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
+    const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+    Status s = FindTable(file_options_, internal_comparator, fd, &table_handle,
+                         prefix_extractor, false /* no_io */,
+                         !for_compaction /* record_read_stats */);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(table_handle);
+    }
+  }
+
+  if (table_reader != nullptr) {
+    result = table_reader->ApproximateOffsetOf(key, caller);
+  }
+  if (table_handle != nullptr) {
+    ReleaseHandle(table_handle);
+  }
+
+  return result;
+}
+
+uint64_t TableCache::ApproximateSize(
+    const Slice& start, const Slice& end, const FileDescriptor& fd,
+    TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+    const SliceTransform* prefix_extractor) {
+  uint64_t result = 0;
+  TableReader* table_reader = fd.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
+    const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+    Status s = FindTable(file_options_, internal_comparator, fd, &table_handle,
+                         prefix_extractor, false /* no_io */,
+                         !for_compaction /* record_read_stats */);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(table_handle);
+    }
+  }
+
+  if (table_reader != nullptr) {
+    result = table_reader->ApproximateSize(start, end, caller);
+  }
+  if (table_handle != nullptr) {
+    ReleaseHandle(table_handle);
+  }
+
+  return result;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..b9de824ee
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see NewIterator() comments
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache
+class TableCache {
+ public:
+  TableCache(const ImmutableCFOptions& ioptions,
+             const FileOptions& storage_options, Cache* cache,
+             BlockCacheTracer* const block_cache_tracer);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes). If "table_reader_ptr"
+  // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+  // underlying the returned iterator, or nullptr if no Table object underlies
+  // the returned iterator. The returned "*table_reader_ptr" object is owned
+  // by the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  // @param range_del_agg If non-nullptr, adds range deletions to the
+  //    aggregator. If an error occurs, returns it in a NewErrorInternalIterator
+  // @param for_compaction If true, a new TableReader may be allocated (but
+  //    not cached), depending on the CF options
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  InternalIterator* NewIterator(
+      const ReadOptions& options, const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+      const SliceTransform* prefix_extractor, TableReader** table_reader_ptr,
+      HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena,
+      bool skip_filters, int level, const InternalKey* smallest_compaction_key,
+      const InternalKey* largest_compaction_key);
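+
+  // Illustrative call shape for NewIterator (hypothetical caller; version
+  // iterators follow this pattern):
+  //
+  //   InternalIterator* it = table_cache->NewIterator(
+  //       read_opts, file_opts, icmp, *file_meta, range_del_agg,
+  //       prefix_extractor, nullptr /* table_reader_ptr */, file_read_hist,
+  //       TableReaderCaller::kUserIterator, arena, false /* skip_filters */,
+  //       level, nullptr, nullptr);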
+
+  // If a seek to internal key "k" in the specified file finds an entry,
+  // call get_context->SaveValue() repeatedly until
+  // it returns false. As a side effect, it will insert the TableReader
+  // into the cache and potentially evict another entry
+  // @param get_context Context for get operation. The result of the lookup
+  //    can be retrieved by calling get_context->State()
+  // @param file_read_hist If non-nullptr, the file reader statistics are
+  //    recorded
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  Status Get(const ReadOptions& options,
+             const InternalKeyComparator& internal_comparator,
+             const FileMetaData& file_meta, const Slice& k,
+             GetContext* get_context,
+             const SliceTransform* prefix_extractor = nullptr,
+             HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+             int level = -1);
+
+  // Return the range delete tombstone iterator of the file specified by
+  // `file_meta`.
+  Status GetRangeTombstoneIterator(
+      const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+  // If a seek to internal key "k" in the specified file finds an entry,
+  // call get_context->SaveValue() repeatedly until
+  // it returns false. As a side effect, it will insert the TableReader
+  // into the cache and potentially evict another entry
+  // @param mget_range Pointer to the structure describing a batch of keys to
+  //    be looked up in this table file. The result is stored
+  //    in the embedded GetContext
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  Status MultiGet(const ReadOptions& options,
+                  const InternalKeyComparator& internal_comparator,
+                  const FileMetaData& file_meta,
+                  const MultiGetContext::Range* mget_range,
+                  const SliceTransform* prefix_extractor = nullptr,
+                  HistogramImpl* file_read_hist = nullptr,
+                  bool skip_filters = false, int level = -1);
+
+  // Evict any entry for the specified file number
+  static void Evict(Cache* cache, uint64_t file_number);
+
+  // Clean table handle and erase it from the table cache
+  // Used in DB close, or the file is not live anymore.
+  void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+  // Find table reader
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level == -1 means not specified
+  Status FindTable(const FileOptions& toptions,
+                   const InternalKeyComparator& internal_comparator,
+                   const FileDescriptor& file_fd, Cache::Handle**,
+                   const SliceTransform* prefix_extractor = nullptr,
+                   const bool no_io = false, bool record_read_stats = true,
+                   HistogramImpl* file_read_hist = nullptr,
+                   bool skip_filters = false, int level = -1,
+                   bool prefetch_index_and_filter_in_cache = true);
+
+  // Get TableReader from a cache handle.
+  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+  // Get the table properties of a given table.
+  // @no_io: if true, do not load the table into the cache when it is not
+  //    already present.
+  // @returns: `properties` will be reset on success. Please note that we will
+  //    return Status::Incomplete() if the table is not present in the cache
+  //    and we set `no_io` to be true.
+  Status GetTableProperties(const FileOptions& toptions,
+                            const InternalKeyComparator& internal_comparator,
+                            const FileDescriptor& file_meta,
+                            std::shared_ptr<const TableProperties>* properties,
+                            const SliceTransform* prefix_extractor = nullptr,
+                            bool no_io = false);
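+
+  // Sketch of a typical GetTableProperties call (hypothetical values):
+  //
+  //   std::shared_ptr<const TableProperties> props;
+  //   Status s = table_cache->GetTableProperties(
+  //       file_opts, icmp, fd, &props, nullptr /* prefix_extractor */,
+  //       true /* no_io */);
+  //   if (s.IsIncomplete()) { /* not cached and no_io was set */ }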
+
+  // Return the total memory usage of the table reader of the file.
+  // 0 if the table reader of the file is not loaded.
+  size_t GetMemoryUsageByTableReader(
+      const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileDescriptor& fd,
+      const SliceTransform* prefix_extractor = nullptr);
+
+  // Returns the approximate offset of a key in the file represented by fd.
+  uint64_t ApproximateOffsetOf(
+      const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
+      const InternalKeyComparator& internal_comparator,
+      const SliceTransform* prefix_extractor = nullptr);
+
+  // Returns the approximate data size between start and end keys in the file
+  // represented by fd (the start key must not be greater than the end key).
+  uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                           const FileDescriptor& fd, TableReaderCaller caller,
+                           const InternalKeyComparator& internal_comparator,
+                           const SliceTransform* prefix_extractor = nullptr);
+
+  // Release the handle from a cache
+  void ReleaseHandle(Cache::Handle* handle);
+
+  Cache* get_cache() const { return cache_; }
+
+  // Capacity of the backing Cache that indicates infinite TableCache capacity.
+  // For example, when max_open_files is -1 we set the backing Cache to this.
+  static const int kInfiniteCapacity = 0x400000;
+
+  // The tables opened with this TableCache will be immortal, i.e., their
+  // lifetime is as long as that of the DB.
+  void SetTablesAreImmortal() {
+    if (cache_->GetCapacity() >= kInfiniteCapacity) {
+      immortal_tables_ = true;
+    }
+  }
+
+ private:
+  // Build a table reader
+  Status GetTableReader(const FileOptions& file_options,
+                        const InternalKeyComparator& internal_comparator,
+                        const FileDescriptor& fd, bool sequential_mode,
+                        bool record_read_stats, HistogramImpl* file_read_hist,
+                        std::unique_ptr<TableReader>* table_reader,
+                        const SliceTransform* prefix_extractor = nullptr,
+                        bool skip_filters = false, int level = -1,
+                        bool prefetch_index_and_filter_in_cache = true);
+
+  // Create a key prefix for looking up the row cache. The prefix is of the
+  // format row_cache_id + fd_number + seq_no. Later, the user key can be
+  // appended to form the full key
+  void CreateRowCacheKeyPrefix(const ReadOptions& options,
+                               const FileDescriptor& fd,
+                               const Slice& internal_key,
+                               GetContext* get_context, IterKey& row_cache_key);
+
+  // Helper function to lookup the row cache for a key. It appends the
+  // user key to row_cache_key at offset prefix_size
+  bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+                       size_t prefix_size, GetContext* get_context);
+
+  const ImmutableCFOptions& ioptions_;
+  const FileOptions& file_options_;
+  Cache* const cache_;
+  std::string row_cache_id_;
+  bool immortal_tables_;
+  BlockCacheTracer* const block_cache_tracer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
new file mode 100644
index 000000000..d98ff5e9b
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include "db/table_properties_collector.h" + +#include "db/dbformat.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +uint64_t GetUint64Property(const UserCollectedProperties& props, + const std::string& property_name, + bool* property_present) { + auto pos = props.find(property_name); + if (pos == props.end()) { + *property_present = false; + return 0; + } + Slice raw = pos->second; + uint64_t val = 0; + *property_present = true; + return GetVarint64(&raw, &val) ? val : 0; +} + +} // namespace + +Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, + const Slice& value, + uint64_t file_size) { + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + return Status::InvalidArgument("Invalid internal key"); + } + + return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type), + ikey.sequence, file_size); +} + +void UserKeyTablePropertiesCollector::BlockAdd( + uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast, + uint64_t blockCompressedBytesSlow) { + return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast, + blockCompressedBytesSlow); +} + +Status UserKeyTablePropertiesCollector::Finish( + UserCollectedProperties* properties) { + return collector_->Finish(properties); +} + +UserCollectedProperties +UserKeyTablePropertiesCollector::GetReadableProperties() const { + return collector_->GetReadableProperties(); +} + +uint64_t GetDeletedKeys( + const UserCollectedProperties& props) { + bool property_present_ignored; + return GetUint64Property(props, TablePropertiesNames::kDeletedKeys, + &property_present_ignored); +} + +uint64_t GetMergeOperands(const UserCollectedProperties& props, + bool* property_present) { + return GetUint64Property( + props, TablePropertiesNames::kMergeOperands, property_present); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h new file mode 100644 index 000000000..130eb64d4 --- /dev/null +++ b/src/rocksdb/db/table_properties_collector.h @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file defines a collection of statistics collectors. +#pragma once + +#include "rocksdb/table_properties.h" + +#include +#include +#include + +namespace ROCKSDB_NAMESPACE { + +// Base class for internal table properties collector. +class IntTblPropCollector { + public: + virtual ~IntTblPropCollector() {} + virtual Status Finish(UserCollectedProperties* properties) = 0; + + virtual const char* Name() const = 0; + + // @params key the user key that is inserted into the table. + // @params value the value that is inserted into the table. + virtual Status InternalAdd(const Slice& key, const Slice& value, + uint64_t file_size) = 0; + + virtual void BlockAdd(uint64_t blockRawBytes, + uint64_t blockCompressedBytesFast, + uint64_t blockCompressedBytesSlow) = 0; + + virtual UserCollectedProperties GetReadableProperties() const = 0; + + virtual bool NeedCompact() const { return false; } +}; + +// Factory for internal table properties collector. 
+
+// Factory for internal table properties collector.
+class IntTblPropCollectorFactory {
+ public:
+  virtual ~IntTblPropCollectorFactory() {}
+  // has to be thread-safe
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t column_family_id) = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contain meta information about a given entry.
+//
+// This class extracts the user key from the encoded internal key when Add()
+// is invoked.
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
+ public:
+  // transfer of ownership
+  explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+      : collector_(collector) {}
+
+  virtual ~UserKeyTablePropertiesCollector() {}
+
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) override;
+
+  virtual void BlockAdd(uint64_t blockRawBytes,
+                        uint64_t blockCompressedBytesFast,
+                        uint64_t blockCompressedBytesSlow) override;
+
+  virtual Status Finish(UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override { return collector_->Name(); }
+
+  UserCollectedProperties GetReadableProperties() const override;
+
+  virtual bool NeedCompact() const override {
+    return collector_->NeedCompact();
+  }
+
+ protected:
+  std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  explicit UserKeyTablePropertiesCollectorFactory(
+      std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+      : user_collector_factory_(user_collector_factory) {}
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t column_family_id) override {
+    TablePropertiesCollectorFactory::Context context;
+    context.column_family_id = column_family_id;
+    return new UserKeyTablePropertiesCollector(
+        user_collector_factory_->CreateTablePropertiesCollector(context));
+  }
+
+  virtual const char* Name() const override {
+    return user_collector_factory_->Name();
+  }
+
+ private:
+  std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
new file mode 100644
index 000000000..5c202de81
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -0,0 +1,515 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/table_properties_collector.h"
+#include "env/composite_env_wrapper.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TablePropertiesTest : public testing::Test,
+                            public testing::WithParamInterface<bool> {
+ public:
+  void SetUp() override { backward_mode_ = GetParam(); }
+
+  bool backward_mode_;
+};
+
+// Utility test functions
+namespace {
+static const uint32_t kTestColumnFamilyId = 66;
+static const std::string kTestColumnFamilyName = "test_column_fam";
+
+void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
+                 const MutableCFOptions& moptions,
+                 const InternalKeyComparator& internal_comparator,
+                 const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+                     int_tbl_prop_collector_factories,
+                 std::unique_ptr<WritableFileWriter>* writable,
+                 std::unique_ptr<TableBuilder>* builder) {
+  std::unique_ptr<WritableFile> wf(new test::StringSink);
+  writable->reset(
+      new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)),
+                             "" /* don't care */, EnvOptions()));
+  int unknown_level = -1;
+  builder->reset(NewTableBuilder(
+      ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
+      kTestColumnFamilyId, kTestColumnFamilyName, writable->get(),
+      options.compression, options.sample_for_compression,
+      options.compression_opts, unknown_level));
+}
+} // namespace
+
+// Collects keys that start with "A" in a table.
+class RegularKeysStartWithA : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    std::string encoded_num_puts;
+    std::string encoded_num_deletes;
+    std::string encoded_num_single_deletes;
+    std::string encoded_num_size_changes;
+    PutVarint32(&encoded, count_);
+    PutVarint32(&encoded_num_puts, num_puts_);
+    PutVarint32(&encoded_num_deletes, num_deletes_);
+    PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
+    PutVarint32(&encoded_num_size_changes, num_size_changes_);
+    *properties = UserCollectedProperties{
+        {"TablePropertiesTest", message_},
+        {"Count", encoded},
+        {"NumPuts", encoded_num_puts},
+        {"NumDeletes", encoded_num_deletes},
+        {"NumSingleDeletes", encoded_num_single_deletes},
+        {"NumSizeChanges", encoded_num_size_changes},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& user_key, const Slice& /*value*/,
+                    EntryType type, SequenceNumber /*seq*/,
+                    uint64_t file_size) override {
+    // Simply assume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    if (type == kEntryPut) {
+      num_puts_++;
+    } else if (type == kEntryDelete) {
+      num_deletes_++;
+    } else if (type == kEntrySingleDelete) {
+      num_single_deletes_++;
+    }
+    if (file_size < file_size_) {
+      message_ = "File size should not decrease.";
+    } else if (file_size != file_size_) {
+      num_size_changes_++;
+    }
+
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+  uint32_t num_puts_ = 0;
+  uint32_t num_deletes_ = 0;
+  uint32_t num_single_deletes_ = 0;
+  uint32_t num_size_changes_ = 0;
+  uint64_t file_size_ = 0;
+};
+
+// Collects keys that start with "A" in a table. Backward compatible mode.
+// It is also used to test the internal key table property collector.
+class RegularKeysStartWithABackwardCompatible
+    : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status Add(const Slice& user_key, const Slice& /*value*/) override {
+    // Simply assume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status InternalAdd(const Slice& user_key, const Slice& /*value*/,
+                     uint64_t /*file_size*/) override {
+    // Simply assume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  void BlockAdd(uint64_t /* blockRawBytes */,
+                uint64_t /* blockCompressedBytesFast */,
+                uint64_t /* blockCompressedBytesSlow */) override {
+    // Nothing to do.
+    return;
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
+                                     public TablePropertiesCollectorFactory {
+ public:
+  explicit RegularKeysStartWithAFactory(bool backward_mode)
+      : backward_mode_(backward_mode) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override {
+    EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
+    if (!backward_mode_) {
+      return new RegularKeysStartWithA();
+    } else {
+      return new RegularKeysStartWithABackwardCompatible();
+    }
+  }
+  IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t /*column_family_id*/) override {
+    return new RegularKeysStartWithAInternal();
+  }
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  bool backward_mode_;
+};
+
+class FlushBlockEveryThreePolicy : public FlushBlockPolicy {
+ public:
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    return (++count_ % 3U == 0);
+  }
+
+ private:
+  uint64_t count_ = 0;
+};
+
+class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit FlushBlockEveryThreePolicyFactory() {}
+
+  const char* Name() const override {
+    return "FlushBlockEveryThreePolicyFactory";
+  }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& /*data_block_builder*/) const override {
+    return new FlushBlockEveryThreePolicy;
+  }
+};
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+namespace {
+void TestCustomizedTablePropertiesCollector(
+    bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
+    const Options& options, const InternalKeyComparator& internal_comparator) {
+  // Make sure the entries will be inserted in order.
+  std::map<std::pair<std::string, ValueType>, std::string> kvs = {
+      {{"About   ", kTypeValue}, "val5"},  // starts with 'A'
+      {{"Abstract", kTypeValue}, "val2"},  // starts with 'A'
+      {{"Around  ", kTypeValue}, "val7"},  // starts with 'A'
+      {{"Beyond  ", kTypeValue}, "val3"},
+      {{"Builder ", kTypeValue}, "val1"},
+      {{"Love    ", kTypeDeletion}, ""},
+      {{"Cancel  ", kTypeValue}, "val4"},
+      {{"Find    ", kTypeValue}, "val6"},
+      {{"Rocks   ", kTypeDeletion}, ""},
+      {{"Foo     ", kTypeSingleDeletion}, ""},
+  };
+
+  // -- Step 1: build table
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<WritableFileWriter> writer;
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
+  if (test_int_tbl_prop_collector) {
+    int_tbl_prop_collector_factories.emplace_back(
+        new RegularKeysStartWithAFactory(backward_mode));
+  } else {
+    GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+  }
+  MakeBuilder(options, ioptions, moptions, internal_comparator,
+              &int_tbl_prop_collector_factories, &writer, &builder);
+
+  SequenceNumber seqNum = 0U;
+  for (const auto& kv : kvs) {
+    InternalKey ikey(kv.first.first, seqNum++, kv.first.second);
+    builder->Add(ikey.Encode(), kv.second);
+  }
+  ASSERT_OK(builder->Finish());
+  writer->Flush();
+
+  // -- Step 2: Read properties
+  LegacyWritableFileWrapper* file =
+      static_cast<LegacyWritableFileWrapper*>(writer->writable_file());
+  test::StringSink* fwf = static_cast<test::StringSink*>(file->target());
+  std::unique_ptr<RandomAccessFileReader> fake_file_reader(
+      test::GetRandomAccessFileReader(
+          new test::StringSource(fwf->contents())));
+  TableProperties* props;
+  Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(),
+                                 magic_number, ioptions, &props,
+                                 true /* compression_type_missing */);
+  std::unique_ptr<TableProperties> props_guard(props);
+  ASSERT_OK(s);
+
+  auto user_collected = props->user_collected_properties;
+
+  ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end());
+  ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+  uint32_t starts_with_A = 0;
+  ASSERT_NE(user_collected.find("Count"), user_collected.end());
+  Slice key(user_collected.at("Count"));
+  ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+  ASSERT_EQ(3u, starts_with_A);
+
+  if (!backward_mode && !test_int_tbl_prop_collector) {
+    uint32_t num_puts;
+    ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+    Slice key_puts(user_collected.at("NumPuts"));
+    ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+    ASSERT_EQ(7u, num_puts);
+
+    uint32_t num_deletes;
+    ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+    Slice key_deletes(user_collected.at("NumDeletes"));
+    ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+    ASSERT_EQ(2u, num_deletes);
+
+    uint32_t num_single_deletes;
+    ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end());
+    Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+    ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+    ASSERT_EQ(1u, num_single_deletes);
+
+    uint32_t num_size_changes;
+    ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end());
+    Slice key_size_changes(user_collected.at("NumSizeChanges"));
+    ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
+    ASSERT_GE(num_size_changes, 2u);
+  }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+  // Test properties collectors with internal keys or regular keys
+  // for block based table
+  for (bool encode_as_internal : {true, false}) {
+    Options options;
+    BlockBasedTableOptions table_options;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryThreePolicyFactory>();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    test::PlainInternalKeyComparator ikc(options.comparator);
+    std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+        new RegularKeysStartWithAFactory(backward_mode_));
+    options.table_properties_collector_factories.resize(1);
+    options.table_properties_collector_factories[0] = collector_factory;
+
+    TestCustomizedTablePropertiesCollector(backward_mode_,
+                                           kBlockBasedTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+
+#ifndef ROCKSDB_LITE  // PlainTable is not supported in Lite
+    // test plain table
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 8;
+    plain_table_options.bloom_bits_per_key = 8;
+    plain_table_options.hash_table_ratio = 0;
+
+    options.table_factory =
+        std::make_shared<PlainTableFactory>(plain_table_options);
+    TestCustomizedTablePropertiesCollector(backward_mode_,
+                                           kPlainTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+#endif  // !ROCKSDB_LITE
+  }
+}
+
+namespace {
+void TestInternalKeyPropertiesCollector(
+    bool backward_mode, uint64_t magic_number, bool sanitized,
+    std::shared_ptr<TableFactory> table_factory) {
+  InternalKey keys[] = {
+      InternalKey("A       ", 0, ValueType::kTypeValue),
+      InternalKey("B       ", 1, ValueType::kTypeValue),
+      InternalKey("C       ", 2, ValueType::kTypeValue),
+      InternalKey("W       ", 3, ValueType::kTypeDeletion),
+      InternalKey("X       ", 4, ValueType::kTypeDeletion),
+      InternalKey("Y       ", 5, ValueType::kTypeDeletion),
+      InternalKey("Z       ", 6, ValueType::kTypeDeletion),
+      InternalKey("a       ", 7, ValueType::kTypeSingleDeletion),
+      InternalKey("b       ", 8, ValueType::kTypeMerge),
+      InternalKey("c       ", 9, ValueType::kTypeMerge),
+  };
+
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<WritableFileWriter> writable;
+  Options options;
+  test::PlainInternalKeyComparator pikc(options.comparator);
+
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
+  options.table_factory = table_factory;
+  if (sanitized) {
+    options.table_properties_collector_factories.emplace_back(
+        new RegularKeysStartWithAFactory(backward_mode));
+    // With sanitization, even a regular properties collector will be able to
+    // handle internal keys.
+    auto comparator = options.comparator;
+    // HACK: Set options.info_log to avoid writing log in
+    // SanitizeOptions().
+    options.info_log = std::make_shared<test::NullLogger>();
+    options = SanitizeOptions("db",  // just a place holder
+                              options);
+    ImmutableCFOptions ioptions(options);
+    GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+    options.comparator = comparator;
+  }
+  const ImmutableCFOptions ioptions(options);
+  MutableCFOptions moptions(options);
+
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeBuilder(options, ioptions, moptions, pikc,
+                &int_tbl_prop_collector_factories, &writable, &builder);
+    for (const auto& k : keys) {
+      builder->Add(k.Encode(), "val");
+    }
+
+    ASSERT_OK(builder->Finish());
+    writable->Flush();
+
+    LegacyWritableFileWrapper* file =
+        static_cast<LegacyWritableFileWrapper*>(writable->writable_file());
+    test::StringSink* fwf = static_cast<test::StringSink*>(file->target());
+    std::unique_ptr<RandomAccessFileReader> reader(
+        test::GetRandomAccessFileReader(
+            new test::StringSource(fwf->contents())));
+    TableProperties* props;
+    Status s =
+        ReadTableProperties(reader.get(), fwf->contents().size(), magic_number,
+                            ioptions, &props,
+                            true /* compression_type_missing */);
+    ASSERT_OK(s);
+
+    std::unique_ptr<TableProperties> props_guard(props);
+    auto user_collected = props->user_collected_properties;
+    uint64_t deleted = GetDeletedKeys(user_collected);
+    ASSERT_EQ(5u, deleted);  // deletes + single-deletes
+
+    bool property_present;
+    uint64_t merges = GetMergeOperands(user_collected, &property_present);
+    ASSERT_TRUE(property_present);
+    ASSERT_EQ(2u, merges);
+
+    if (sanitized) {
+      uint32_t starts_with_A = 0;
+      ASSERT_NE(user_collected.find("Count"), user_collected.end());
+      Slice key(user_collected.at("Count"));
+      ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+      ASSERT_EQ(1u, starts_with_A);
+
+      if (!backward_mode) {
+        uint32_t num_puts;
+        ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+        Slice key_puts(user_collected.at("NumPuts"));
+        ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+        ASSERT_EQ(3u, num_puts);
+
+        uint32_t num_deletes;
+        ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+        Slice key_deletes(user_collected.at("NumDeletes"));
+        ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+        ASSERT_EQ(4u, num_deletes);
+
+        uint32_t num_single_deletes;
+        ASSERT_NE(user_collected.find("NumSingleDeletes"),
+                  user_collected.end());
+        Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+        ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+        ASSERT_EQ(1u, num_single_deletes);
+      }
+    }
+  }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
+  TestInternalKeyPropertiesCollector(
+      backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */,
+      std::make_shared<BlockBasedTableFactory>());
+  if (backward_mode_) {
+    TestInternalKeyPropertiesCollector(
+        backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */,
+        std::make_shared<BlockBasedTableFactory>());
+  }
+
+#ifndef ROCKSDB_LITE  // PlainTable is not supported in Lite
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 8;
+  plain_table_options.bloom_bits_per_key = 8;
+  plain_table_options.hash_table_ratio = 0;
+
+  TestInternalKeyPropertiesCollector(
+      backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
+      std::make_shared<PlainTableFactory>(plain_table_options));
+#endif  // !ROCKSDB_LITE
+}
+
+INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
+                        ::testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest,
+                        ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
new file mode 100644
index 000000000..56bc161a3
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -0,0 +1,315 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/transaction_log_impl.h"
+#include <cinttypes>
+#include "db/write_batch_internal.h"
+#include "file/sequence_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+    const std::string& dir, const ImmutableDBOptions* options,
+    const TransactionLogIterator::ReadOptions& read_options,
+    const EnvOptions& soptions, const SequenceNumber seq,
+    std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+    const bool seq_per_batch)
+    : dir_(dir),
+      options_(options),
+      read_options_(read_options),
+      soptions_(soptions),
+      starting_sequence_number_(seq),
+      files_(std::move(files)),
+      started_(false),
+      is_valid_(false),
+      current_file_index_(0),
+      current_batch_seq_(0),
+      current_last_seq_(0),
+      versions_(versions),
+      seq_per_batch_(seq_per_batch) {
+  assert(files_ != nullptr);
+  assert(versions_ != nullptr);
+
+  reporter_.env = options_->env;
+  reporter_.info_log = options_->info_log.get();
+  SeekToStartSequence();  // Seek to the starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+    const LogFile* log_file,
+    std::unique_ptr<SequentialFileReader>* file_reader) {
+  FileSystem* fs = options_->fs.get();
+  std::unique_ptr<FSSequentialFile> file;
+  std::string fname;
+  Status s;
+  EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_);
+  if (log_file->Type() == kArchivedLogFile) {
+    fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+    s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+  } else {
+    fname = LogFileName(dir_, log_file->LogNumber());
+    s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+    if (!s.ok()) {
+      // If we cannot open the file in the DB directory, try the archive
+      // dir, as the file could have been moved there in the meantime.
+      fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+      s = fs->NewSequentialFile(fname, optimized_env_options,
+                                &file, nullptr);
+    }
+  }
+  if (s.ok()) {
+    file_reader->reset(new SequentialFileReader(std::move(file), fname));
+  }
+  return s;
+}
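+
+// Illustrative consumer of this iterator (hypothetical caller; instances are
+// normally obtained via DB::GetUpdatesSince, which constructs a
+// TransactionLogIteratorImpl):
+//
+//   std::unique_ptr<TransactionLogIterator> iter;
+//   Status s = db->GetUpdatesSince(start_seq, &iter);
+//   for (; s.ok() && iter->Valid(); iter->Next()) {
+//     BatchResult batch = iter->GetBatch();
+//     // batch.sequence, batch.writeBatchPtr ...
+//   }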
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+    const LogFile* log_file,
+    std::unique_ptr<SequentialFileReader>* file_reader) {
+  FileSystem* fs = options_->fs.get();
+  std::unique_ptr<FSSequentialFile> file;
+  std::string fname;
+  Status s;
+  EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_);
+  if (log_file->Type() == kArchivedLogFile) {
+    fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+    s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+  } else {
+    fname = LogFileName(dir_, log_file->LogNumber());
+    s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+    if (!s.ok()) {
+      // If cannot open file in DB directory.
+      // Try the archive dir, as it could have moved in the meanwhile.
+      fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+      s = fs->NewSequentialFile(fname, optimized_env_options,
+                                &file, nullptr);
+    }
+  }
+  if (s.ok()) {
+    file_reader->reset(new SequentialFileReader(std::move(file), fname));
+  }
+  return s;
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch() {
+  assert(is_valid_);  // cannot call in a non-valid state.
+  BatchResult result;
+  result.sequence = current_batch_seq_;
+  result.writeBatchPtr = std::move(current_batch_);
+  return result;
+}
+
+Status TransactionLogIteratorImpl::status() { return current_status_; }
+
+bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; }
+
+bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) {
+  // Don't read if no more complete entries to read from logs
+  if (current_last_seq_ >= versions_->LastSequence()) {
+    return false;
+  }
+  return current_log_reader_->ReadRecord(record, &scratch_);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index,
+                                                     bool strict) {
+  Slice record;
+  started_ = false;
+  is_valid_ = false;
+  if (files_->size() <= start_file_index) {
+    return;
+  }
+  Status s =
+      OpenLogReader(files_->at(static_cast<size_t>(start_file_index)).get());
+  if (!s.ok()) {
+    current_status_ = s;
+    reporter_.Info(current_status_.ToString().c_str());
+    return;
+  }
+  while (RestrictedRead(&record)) {
+    if (record.size() < WriteBatchInternal::kHeader) {
+      reporter_.Corruption(
+          record.size(), Status::Corruption("very small log record"));
+      continue;
+    }
+    UpdateCurrentWriteBatch(record);
+    if (current_last_seq_ >= starting_sequence_number_) {
+      if (strict && current_batch_seq_ != starting_sequence_number_) {
+        current_status_ = Status::Corruption(
+            "Gap in sequence number. Could not "
+            "seek to required sequence number");
+        reporter_.Info(current_status_.ToString().c_str());
+        return;
+      } else if (strict) {
+        reporter_.Info("Seeked to the required sequence number. Iterator "
+                       "will continue.");
+      }
+      is_valid_ = true;
+      started_ = true;  // set started_ as we could seek till starting sequence
+      return;
+    } else {
+      is_valid_ = false;
+    }
+  }
+
+  // Could not find start sequence in first file. Normally this must be the
+  // only file; otherwise log the error and let the iterator return the next
+  // entry. If strict is set, we want to seek exactly to the start sequence,
+  // and it should have been present in the file we scanned above.
+  if (strict) {
+    current_status_ = Status::Corruption(
+        "Gap in sequence number. Could not "
+        "seek to required sequence number");
+    reporter_.Info(current_status_.ToString().c_str());
+  } else if (files_->size() != 1) {
+    current_status_ = Status::Corruption(
+        "Start sequence was not found, "
+        "skipping to the next available");
+    reporter_.Info(current_status_.ToString().c_str());
+    // Let NextImpl find the next available entry. started_ remains false
+    // because we don't want to check for gaps while moving to start sequence
+    NextImpl(true);
+  }
+}
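+
+// Next() advances to the next write batch. A live WAL can keep growing after
+// the reader reports EOF, so NextImpl() clears the reader's EOF marker and
+// re-reads the tail before moving on to the next log file.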
+
+void TransactionLogIteratorImpl::Next() {
+  return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+  Slice record;
+  is_valid_ = false;
+  if (!internal && !started_) {
+    // Runs every time until we can seek to the start sequence
+    return SeekToStartSequence();
+  }
+  while (true) {
+    assert(current_log_reader_);
+    if (current_log_reader_->IsEOF()) {
+      current_log_reader_->UnmarkEOF();
+    }
+    while (RestrictedRead(&record)) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter_.Corruption(
+            record.size(), Status::Corruption("very small log record"));
+        continue;
+      } else {
+        // started_ should be true if called by application
+        assert(internal || started_);
+        // started_ should be false if called internally
+        assert(!internal || !started_);
+        UpdateCurrentWriteBatch(record);
+        if (internal && !started_) {
+          started_ = true;
+        }
+        return;
+      }
+    }
+
+    // Open the next file
+    if (current_file_index_ < files_->size() - 1) {
+      ++current_file_index_;
+      Status s = OpenLogReader(files_->at(current_file_index_).get());
+      if (!s.ok()) {
+        is_valid_ = false;
+        current_status_ = s;
+        return;
+      }
+    } else {
+      is_valid_ = false;
+      if (current_last_seq_ == versions_->LastSequence()) {
+        current_status_ = Status::OK();
+      } else {
+        const char* msg = "Create a new iterator to fetch the new tail.";
+        current_status_ = Status::TryAgain(msg);
+      }
+      return;
+    }
+  }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+    const WriteBatch* batch, const SequenceNumber expected_seq) {
+  assert(batch);
+  SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+  if (batchSeq != expected_seq) {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "Discontinuity in log records. Got seq=%" PRIu64
+             ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+             ". Log iterator will reseek the correct batch.",
+             batchSeq, expected_seq, versions_->LastSequence());
+    reporter_.Info(buf);
+    return false;
+  }
+  return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+  std::unique_ptr<WriteBatch> batch(new WriteBatch());
+  WriteBatchInternal::SetContents(batch.get(), record);
+
+  SequenceNumber expected_seq = current_last_seq_ + 1;
+  // If the iterator has started, then confirm that we get continuous batches
+  if (started_ && !IsBatchExpected(batch.get(), expected_seq)) {
+    // Seek to the batch having expected sequence number
+    if (expected_seq < files_->at(current_file_index_)->StartSequence()) {
+      // Expected batch must lie in the previous log file
+      // Avoid underflow.
+      if (current_file_index_ != 0) {
+        current_file_index_--;
+      }
+    }
+    starting_sequence_number_ = expected_seq;
+    // current_status_ will be set to OK if reseek succeeds
+    // Note: this is still ok in seq_per_batch_ && two_write_queues_ mode
+    // that allows gaps in the WAL since it will still skip over the gap.
+    current_status_ = Status::NotFound("Gap in sequence numbers");
+    // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode
+    // should be disabled
+    return SeekToStartSequence(current_file_index_, !seq_per_batch_);
+  }
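+
+  // In seq_per_batch_ mode a batch may consume several sequence numbers
+  // without carrying that many data entries, so the batch is walked with a
+  // handler: non-empty noops, EndPrepare and Commit markers each take one
+  // sequence number, while Put/Delete/SingleDelete/Merge entries take none.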
+  struct BatchCounter : public WriteBatch::Handler {
+    SequenceNumber sequence_;
+    BatchCounter(SequenceNumber sequence) : sequence_(sequence) {}
+    Status MarkNoop(bool empty_batch) override {
+      if (!empty_batch) {
+        sequence_++;
+      }
+      return Status::OK();
+    }
+    Status MarkEndPrepare(const Slice&) override {
+      sequence_++;
+      return Status::OK();
+    }
+    Status MarkCommit(const Slice&) override {
+      sequence_++;
+      return Status::OK();
+    }
+
+    Status PutCF(uint32_t /*cf*/, const Slice& /*key*/,
+                 const Slice& /*val*/) override {
+      return Status::OK();
+    }
+    Status DeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override {
+      return Status::OK();
+    }
+    Status SingleDeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override {
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t /*cf*/, const Slice& /*key*/,
+                   const Slice& /*val*/) override {
+      return Status::OK();
+    }
+    Status MarkBeginPrepare(bool) override { return Status::OK(); }
+    Status MarkRollback(const Slice&) override { return Status::OK(); }
+  };
+
+  current_batch_seq_ = WriteBatchInternal::Sequence(batch.get());
+  if (seq_per_batch_) {
+    BatchCounter counter(current_batch_seq_);
+    batch->Iterate(&counter);
+    current_last_seq_ = counter.sequence_;
+  } else {
+    current_last_seq_ =
+        current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1;
+  }
+  // current_batch_seq_ can only change here
+  assert(current_last_seq_ <= versions_->LastSequence());
+
+  current_batch_ = std::move(batch);
+  is_valid_ = true;
+  current_status_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) {
+  std::unique_ptr<SequentialFileReader> file;
+  Status s = OpenLogFile(log_file, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(file);
+  current_log_reader_.reset(
+      new log::Reader(options_->info_log, std::move(file), &reporter_,
+                      read_options_.verify_checksums_, log_file->LogNumber()));
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
new file mode 100644
index 000000000..eb53daf2b
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFileImpl : public LogFile {
+ public:
+  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+              uint64_t sizeBytes) :
+    logNumber_(logNum),
+    type_(logType),
+    startSequence_(startSeq),
+    sizeFileBytes_(sizeBytes) {
+  }
+
+  std::string PathName() const override {
+    if (type_ == kArchivedLogFile) {
+      return ArchivedLogFileName("", logNumber_);
+    }
+    return LogFileName("", logNumber_);
+  }
+
+  uint64_t LogNumber() const override { return logNumber_; }
+
+  WalFileType Type() const override { return type_; }
+
+  SequenceNumber StartSequence() const override { return startSequence_; }
+
+  uint64_t SizeFileBytes() const override { return sizeFileBytes_; }
+
+  bool operator < (const LogFile& that) const {
+    return LogNumber() < that.LogNumber();
+  }
+
+ private:
+  uint64_t logNumber_;
+  WalFileType type_;
+  SequenceNumber startSequence_;
+  uint64_t sizeFileBytes_;
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+  TransactionLogIteratorImpl(
+      const std::string& dir, const ImmutableDBOptions* options,
+      const TransactionLogIterator::ReadOptions& read_options,
+      const EnvOptions& soptions, const SequenceNumber seqNum,
+      std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+      const bool seq_per_batch);
+
+  virtual bool Valid() override;
+
+  virtual void Next() override;
+
+  virtual Status status() override;
+
+  virtual BatchResult GetBatch() override;
+
+ private:
+  const std::string& dir_;
+  const ImmutableDBOptions* options_;
+  const TransactionLogIterator::ReadOptions read_options_;
+  const EnvOptions& soptions_;
+  SequenceNumber starting_sequence_number_;
+  std::unique_ptr<VectorLogPtr> files_;
+  bool started_;
+  bool is_valid_;  // not valid when it starts off.
+  Status current_status_;
+  size_t current_file_index_;
+  std::unique_ptr<WriteBatch> current_batch_;
+  std::unique_ptr<log::Reader> current_log_reader_;
+  std::string scratch_;
+  Status OpenLogFile(const LogFile* log_file,
+                     std::unique_ptr<SequentialFileReader>* file);
+
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    virtual void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes,
+                      s.ToString().c_str());
+    }
+    virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); }
+  } reporter_;
+
+  SequenceNumber
+      current_batch_seq_;  // sequence number at start of current batch
+  SequenceNumber current_last_seq_;  // last sequence in the current batch
+  // Used only to get latest seq. num
+  // TODO(icanadi) can this be just a callback?
+  VersionSet const* const versions_;
+  const bool seq_per_batch_;
+  // Reads from transaction log only if the writebatch record has been written
+  bool RestrictedRead(Slice* record);
+  // Seeks to starting_sequence_number_ reading from start_file_index in
+  // files_. If strict is set, then it must get a batch starting with
+  // starting_sequence_number_.
+  void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false);
+  // Implementation of Next.
 SeekToStartSequence calls it internally with
+  // internal=true to let it find the next entry even if it has to jump gaps
+  // because the iterator may start off from the first available entry but
+  // promises to be continuous after that
+  void NextImpl(bool internal = false);
+  // Check if batch is expected, else return false
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq);
+  // Update current batch if a continuous batch is found; otherwise reseek to
+  // the expected batch
+  void UpdateCurrentWriteBatch(const Slice& record);
+  Status OpenLogReader(const LogFile* file);
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/trim_history_scheduler.cc b/src/rocksdb/db/trim_history_scheduler.cc
new file mode 100644
index 000000000..d7ca0899f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/trim_history_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  cfd->Ref();
+  cfds_.push_back(cfd);
+  is_empty_.store(false, std::memory_order_relaxed);
+}
+
+ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() {
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  while (true) {
+    if (cfds_.empty()) {
+      return nullptr;
+    }
+    ColumnFamilyData* cfd = cfds_.back();
+    cfds_.pop_back();
+    if (cfds_.empty()) {
+      is_empty_.store(true, std::memory_order_relaxed);
+    }
+
+    if (!cfd->IsDropped()) {
+      // success
+      return cfd;
+    }
+    cfd->UnrefAndTryDelete();
+  }
+}
+
+bool TrimHistoryScheduler::Empty() {
+  bool is_empty = is_empty_.load(std::memory_order_relaxed);
+  return is_empty;
+}
+
+void TrimHistoryScheduler::Clear() {
+  ColumnFamilyData* cfd;
+  while ((cfd = TakeNextColumnFamily()) != nullptr) {
+    cfd->UnrefAndTryDelete();
+  }
+  assert(Empty());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/trim_history_scheduler.h b/src/rocksdb/db/trim_history_scheduler.h
new file mode 100644
index 000000000..b17f6170f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <mutex>
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps
+// track of column families whose flushed immutable memtables may need to be
+// removed (aka trimmed). The actual trimming may be slightly delayed. Due to
+// the use of the mutex and atomic variable, ScheduleWork,
+// TakeNextColumnFamily, and Empty can be called concurrently.
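+//
+// A minimal usage sketch (illustrative): the producer side schedules a column
+// family after a flush, and a consumer drains the queue, trims, and releases
+// the reference taken by ScheduleWork():
+//
+//   scheduler.ScheduleWork(cfd);
+//   ...
+//   ColumnFamilyData* picked;
+//   while ((picked = scheduler.TakeNextColumnFamily()) != nullptr) {
+//     // trim picked's immutable memtable history here, then:
+//     picked->UnrefAndTryDelete();
+//   }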
+class TrimHistoryScheduler {
+ public:
+  TrimHistoryScheduler() : is_empty_(true) {}
+
+  // When a column family needs history trimming, add cfd to the FIFO queue
+  void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Remove the column family from the queue, the caller is responsible for
+  // calling `MemtableList::TrimHistory`
+  ColumnFamilyData* TakeNextColumnFamily();
+
+  bool Empty();
+
+  void Clear();
+
+  // Not on critical path, use mutex to ensure thread safety
+ private:
+  std::atomic<bool> is_empty_;
+  autovector<ColumnFamilyData*> cfds_;
+  std::mutex checking_mutex_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
new file mode 100644
index 000000000..4694218a1
--- /dev/null
+++ b/src/rocksdb/db/version_builder.cc
@@ -0,0 +1,545 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <map>
+#include <set>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "table/table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+  if (a->fd.largest_seqno != b->fd.largest_seqno) {
+    return a->fd.largest_seqno > b->fd.largest_seqno;
+  }
+  if (a->fd.smallest_seqno != b->fd.smallest_seqno) {
+    return a->fd.smallest_seqno > b->fd.smallest_seqno;
+  }
+  // Break ties by file number
+  return a->fd.GetNumber() > b->fd.GetNumber();
+}
+
+namespace {
+bool BySmallestKey(FileMetaData* a, FileMetaData* b,
+                   const InternalKeyComparator* cmp) {
+  int r = cmp->Compare(a->smallest, b->smallest);
+  if (r != 0) {
+    return (r < 0);
+  }
+  // Break ties by file number
+  return (a->fd.GetNumber() < b->fd.GetNumber());
+}
+}  // namespace
+
+class VersionBuilder::Rep {
+ private:
+  // Helper to sort files_ in v
+  // kLevel0 -- NewestFirstBySeqNo
+  // kLevelNon0 -- BySmallestKey
+  struct FileComparator {
+    enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method;
+    const InternalKeyComparator* internal_comparator;
+
+    FileComparator() : internal_comparator(nullptr) {}
+
+    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+      switch (sort_method) {
+        case kLevel0:
+          return NewestFirstBySeqNo(f1, f2);
+        case kLevelNon0:
+          return BySmallestKey(f1, f2, internal_comparator);
+      }
+      assert(false);
+      return false;
+    }
+  };
+
+  struct LevelState {
+    std::unordered_set<uint64_t> deleted_files;
+    // Map from file number to file meta data.
+    std::unordered_map<uint64_t, FileMetaData*> added_files;
+  };
+
+  const FileOptions& file_options_;
+  Logger* info_log_;
+  TableCache* table_cache_;
+  VersionStorageInfo* base_vstorage_;
+  int num_levels_;
+  LevelState* levels_;
+  // Store states of levels larger than num_levels_. We do this instead of
+  // storing them in levels_ to avoid regression in case there are no files
+  // on invalid levels. The version is not consistent if in the end the files
+  // on invalid levels don't cancel out.
+  std::map<int, std::unordered_set<uint64_t>> invalid_levels_;
+  // Whether there are invalid new files or invalid deletion on levels larger
+  // than num_levels_.
+  bool has_invalid_levels_;
+  FileComparator level_zero_cmp_;
+  FileComparator level_nonzero_cmp_;
+
+ public:
+  Rep(const FileOptions& file_options, Logger* info_log,
+      TableCache* table_cache,
+      VersionStorageInfo* base_vstorage)
+      : file_options_(file_options),
+        info_log_(info_log),
+        table_cache_(table_cache),
+        base_vstorage_(base_vstorage),
+        num_levels_(base_vstorage->num_levels()),
+        has_invalid_levels_(false) {
+    levels_ = new LevelState[num_levels_];
+    level_zero_cmp_.sort_method = FileComparator::kLevel0;
+    level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
+    level_nonzero_cmp_.internal_comparator =
+        base_vstorage_->InternalComparator();
+  }
+
+  ~Rep() {
+    for (int level = 0; level < num_levels_; level++) {
+      const auto& added = levels_[level].added_files;
+      for (auto& pair : added) {
+        UnrefFile(pair.second);
+      }
+    }
+
+    delete[] levels_;
+  }
+
+  void UnrefFile(FileMetaData* f) {
+    f->refs--;
+    if (f->refs <= 0) {
+      if (f->table_reader_handle) {
+        assert(table_cache_ != nullptr);
+        table_cache_->ReleaseHandle(f->table_reader_handle);
+        f->table_reader_handle = nullptr;
+      }
+      delete f;
+    }
+  }
+
+  Status CheckConsistency(VersionStorageInfo* vstorage) {
+#ifdef NDEBUG
+    if (!vstorage->force_consistency_checks()) {
+      // Don't run consistency checks in release mode except if
+      // explicitly asked to
+      return Status::OK();
+    }
+#endif
+    // make sure the files are sorted correctly
+    for (int level = 0; level < num_levels_; level++) {
+      auto& level_files = vstorage->LevelFiles(level);
+      for (size_t i = 1; i < level_files.size(); i++) {
+        auto f1 = level_files[i - 1];
+        auto f2 = level_files[i];
+#ifndef NDEBUG
+        auto pair = std::make_pair(&f1, &f2);
+        TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency", &pair);
+#endif
+        if (level == 0) {
+          if (!level_zero_cmp_(f1, f2)) {
+            fprintf(stderr, "L0 files are not sorted properly");
+            return Status::Corruption("L0 files are not sorted properly");
+          }
+
+          if (f2->fd.smallest_seqno == f2->fd.largest_seqno) {
+            // This is an external file that we ingested
+            SequenceNumber external_file_seqno = f2->fd.smallest_seqno;
+            if (!(external_file_seqno < f1->fd.largest_seqno ||
+                  external_file_seqno == 0)) {
+              fprintf(stderr,
+                      "L0 file with seqno %" PRIu64 " %" PRIu64
+                      " vs. file with global_seqno %" PRIu64 "\n",
+                      f1->fd.smallest_seqno, f1->fd.largest_seqno,
+                      external_file_seqno);
+              return Status::Corruption(
+                  "L0 file with seqno " +
+                  NumberToString(f1->fd.smallest_seqno) + " " +
+                  NumberToString(f1->fd.largest_seqno) +
+                  " vs. file with global_seqno " +
+                  NumberToString(external_file_seqno) + " with fileNumber " +
+                  NumberToString(f1->fd.GetNumber()));
+            }
+          } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) {
+            fprintf(stderr,
+                    "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64
+                    " %" PRIu64 "\n",
+                    f1->fd.smallest_seqno, f1->fd.largest_seqno,
+                    f2->fd.smallest_seqno, f2->fd.largest_seqno);
+            return Status::Corruption(
+                "L0 files seqno " + NumberToString(f1->fd.smallest_seqno) +
+                " " + NumberToString(f1->fd.largest_seqno) + " " +
+                NumberToString(f1->fd.GetNumber()) + " vs. " +
" + + NumberToString(f2->fd.smallest_seqno) + " " + + NumberToString(f2->fd.largest_seqno) + " " + + NumberToString(f2->fd.GetNumber())); + } + } else { + if (!level_nonzero_cmp_(f1, f2)) { + fprintf(stderr, "L%d files are not sorted properly", level); + return Status::Corruption("L" + NumberToString(level) + + " files are not sorted properly"); + } + + // Make sure there is no overlap in levels > 0 + if (vstorage->InternalComparator()->Compare(f1->largest, + f2->smallest) >= 0) { + fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level, + (f1->largest).DebugString(true).c_str(), + (f2->smallest).DebugString(true).c_str()); + return Status::Corruption( + "L" + NumberToString(level) + " have overlapping ranges " + + (f1->largest).DebugString(true) + " vs. " + + (f2->smallest).DebugString(true)); + } + } + } + } + return Status::OK(); + } + + Status CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, + int level) { +#ifdef NDEBUG + if (!base_vstorage_->force_consistency_checks()) { + // Dont run consistency checks in release mode except if + // explicitly asked to + return Status::OK(); + } +#endif + // a file to be deleted better exist in the previous version + bool found = false; + for (int l = 0; !found && l < num_levels_; l++) { + const std::vector& base_files = + base_vstorage_->LevelFiles(l); + for (size_t i = 0; i < base_files.size(); i++) { + FileMetaData* f = base_files[i]; + if (f->fd.GetNumber() == number) { + found = true; + break; + } + } + } + // if the file did not exist in the previous version, then it + // is possibly moved from lower level to higher level in current + // version + for (int l = level + 1; !found && l < num_levels_; l++) { + auto& level_added = levels_[l].added_files; + auto got = level_added.find(number); + if (got != level_added.end()) { + found = true; + break; + } + } + + // maybe this file was added in a previous edit that was Applied + if (!found) { + auto& level_added = levels_[level].added_files; + auto got = level_added.find(number); + if (got != level_added.end()) { + found = true; + } + } + if (!found) { + fprintf(stderr, "not found %" PRIu64 "\n", number); + return Status::Corruption("not found " + NumberToString(number)); + } + return Status::OK(); + } + + bool CheckConsistencyForNumLevels() { + // Make sure there are no files on or beyond num_levels(). + if (has_invalid_levels_) { + return false; + } + for (auto& level : invalid_levels_) { + if (level.second.size() > 0) { + return false; + } + } + return true; + } + + // Apply all of the edits in *edit to the current state. + Status Apply(VersionEdit* edit) { + Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } + + // Delete files + const auto& del = edit->GetDeletedFiles(); + for (const auto& del_file : del) { + const auto level = del_file.first; + const auto number = del_file.second; + if (level < num_levels_) { + levels_[level].deleted_files.insert(number); + CheckConsistencyForDeletes(edit, number, level); + + auto exising = levels_[level].added_files.find(number); + if (exising != levels_[level].added_files.end()) { + UnrefFile(exising->second); + levels_[level].added_files.erase(exising); + } + } else { + if (invalid_levels_[level].erase(number) == 0) { + // Deleting an non-existing file on invalid level. 
+          has_invalid_levels_ = true;
+        }
+      }
+    }
+
+    // Add new files
+    for (const auto& new_file : edit->GetNewFiles()) {
+      const int level = new_file.first;
+      if (level < num_levels_) {
+        FileMetaData* f = new FileMetaData(new_file.second);
+        f->refs = 1;
+
+        assert(levels_[level].added_files.find(f->fd.GetNumber()) ==
+               levels_[level].added_files.end());
+        levels_[level].deleted_files.erase(f->fd.GetNumber());
+        levels_[level].added_files[f->fd.GetNumber()] = f;
+      } else {
+        uint64_t number = new_file.second.fd.GetNumber();
+        auto& lvls = invalid_levels_[level];
+        if (lvls.count(number) == 0) {
+          lvls.insert(number);
+        } else {
+          // Creating an already existing file on an invalid level.
+          has_invalid_levels_ = true;
+        }
+      }
+    }
+    return s;
+  }
+
+  // Save the current state in *v.
+  Status SaveTo(VersionStorageInfo* vstorage) {
+    Status s = CheckConsistency(base_vstorage_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = CheckConsistency(vstorage);
+    if (!s.ok()) {
+      return s;
+    }
+
+    for (int level = 0; level < num_levels_; level++) {
+      const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
+      // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files. Store the result in *v.
+      const auto& base_files = base_vstorage_->LevelFiles(level);
+      const auto& unordered_added_files = levels_[level].added_files;
+      vstorage->Reserve(level,
+                        base_files.size() + unordered_added_files.size());
+
+      // Sort added files for the level.
+      std::vector<FileMetaData*> added_files;
+      added_files.reserve(unordered_added_files.size());
+      for (const auto& pair : unordered_added_files) {
+        added_files.push_back(pair.second);
+      }
+      std::sort(added_files.begin(), added_files.end(), cmp);
+
+#ifndef NDEBUG
+      FileMetaData* prev_added_file = nullptr;
+      for (const auto& added : added_files) {
+        if (level > 0 && prev_added_file != nullptr) {
+          assert(base_vstorage_->InternalComparator()->Compare(
+                     prev_added_file->smallest, added->smallest) <= 0);
+        }
+        prev_added_file = added;
+      }
+#endif
+
+      auto base_iter = base_files.begin();
+      auto base_end = base_files.end();
+      auto added_iter = added_files.begin();
+      auto added_end = added_files.end();
+      while (added_iter != added_end || base_iter != base_end) {
+        if (base_iter == base_end ||
+            (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+          MaybeAddFile(vstorage, level, *added_iter++);
+        } else {
+          MaybeAddFile(vstorage, level, *base_iter++);
+        }
+      }
+    }
+
+    s = CheckConsistency(vstorage);
+    return s;
+  }
+
+  Status LoadTableHandlers(InternalStats* internal_stats, int max_threads,
+                           bool prefetch_index_and_filter_in_cache,
+                           bool is_initial_load,
+                           const SliceTransform* prefix_extractor) {
+    assert(table_cache_ != nullptr);
+
+    size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
+    bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
+    size_t max_load = port::kMaxSizet;
+
+    if (!always_load) {
+      // If this is the initial load and we are not set to always load all
+      // the files, we only load up to kInitialLoadLimit files, to limit the
+      // time spent reopening the DB.
+      const size_t kInitialLoadLimit = 16;
+      size_t load_limit;
+      // If the table cache is not 1/4 full, we pin the table handle to
+      // file metadata to avoid the cache read costs when reading the file.
+      // The downside of pinning those files is that LRU won't be followed
+      // for those files.
+      // This doesn't matter much because if the number of files of the DB
+      // exceeds the table cache capacity, eventually no table reader will
+      // be pinned and LRU will be followed.
+      if (is_initial_load) {
+        load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+      } else {
+        load_limit = table_cache_capacity / 4;
+      }
+
+      size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
+      if (table_cache_usage >= load_limit) {
+        // TODO (yanqin) find a suitable status code.
+        return Status::OK();
+      } else {
+        max_load = load_limit - table_cache_usage;
+      }
+    }
+
+    // <file metadata, level>
+    std::vector<std::pair<FileMetaData*, int>> files_meta;
+    std::vector<Status> statuses;
+    for (int level = 0; level < num_levels_; level++) {
+      for (auto& file_meta_pair : levels_[level].added_files) {
+        auto* file_meta = file_meta_pair.second;
+        // If the file has been opened before, just skip it.
+        if (!file_meta->table_reader_handle) {
+          files_meta.emplace_back(file_meta, level);
+          statuses.emplace_back(Status::OK());
+        }
+        if (files_meta.size() >= max_load) {
+          break;
+        }
+      }
+      if (files_meta.size() >= max_load) {
+        break;
+      }
+    }
+
+    std::atomic<size_t> next_file_meta_idx(0);
+    std::function<void()> load_handlers_func([&]() {
+      while (true) {
+        size_t file_idx = next_file_meta_idx.fetch_add(1);
+        if (file_idx >= files_meta.size()) {
+          break;
+        }
+
+        auto* file_meta = files_meta[file_idx].first;
+        int level = files_meta[file_idx].second;
+        statuses[file_idx] = table_cache_->FindTable(
+            file_options_, *(base_vstorage_->InternalComparator()),
+            file_meta->fd, &file_meta->table_reader_handle, prefix_extractor,
+            false /* no_io */, true /* record_read_stats */,
+            internal_stats->GetFileReadHist(level), false, level,
+            prefetch_index_and_filter_in_cache);
+        if (file_meta->table_reader_handle != nullptr) {
+          // Load table_reader
+          file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+              file_meta->table_reader_handle);
+        }
+      }
+    });
+
+    std::vector<port::Thread> threads;
+    for (int i = 1; i < max_threads; i++) {
+      threads.emplace_back(load_handlers_func);
+    }
+    load_handlers_func();
+    for (auto& t : threads) {
+      t.join();
+    }
+    for (const auto& s : statuses) {
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) {
+    if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
+      // f is to-be-deleted table file
+      vstorage->RemoveCurrentStats(f);
+    } else {
+      vstorage->AddFile(level, f, info_log_);
+    }
+  }
+};
+
+VersionBuilder::VersionBuilder(const FileOptions& file_options,
+                               TableCache* table_cache,
+                               VersionStorageInfo* base_vstorage,
+                               Logger* info_log)
+    : rep_(new Rep(file_options, info_log, table_cache, base_vstorage)) {}
+
+VersionBuilder::~VersionBuilder() { delete rep_; }
+
+Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
+  return rep_->CheckConsistency(vstorage);
+}
+
+Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit,
+                                                  uint64_t number, int level) {
+  return rep_->CheckConsistencyForDeletes(edit, number, level);
+}
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+  return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(VersionEdit* edit) { return rep_->Apply(edit); }
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
+  return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+    InternalStats* internal_stats, int max_threads,
+    bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+    const SliceTransform* prefix_extractor) {
+  return rep_->LoadTableHandlers(internal_stats, max_threads,
+                                 prefetch_index_and_filter_in_cache,
+                                 is_initial_load, prefix_extractor);
+}
+
+void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
+                                  FileMetaData* f) {
+  rep_->MaybeAddFile(vstorage, level, f);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 000000000..87415ed55
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+  VersionBuilder(const FileOptions& file_options, TableCache* table_cache,
+                 VersionStorageInfo* base_vstorage, Logger* info_log = nullptr);
+  ~VersionBuilder();
+  Status CheckConsistency(VersionStorageInfo* vstorage);
+  Status CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
+                                    int level);
+  bool CheckConsistencyForNumLevels();
+  Status Apply(VersionEdit* edit);
+  Status SaveTo(VersionStorageInfo* vstorage);
+  Status LoadTableHandlers(InternalStats* internal_stats, int max_threads,
+                           bool prefetch_index_and_filter_in_cache,
+                           bool is_initial_load,
+                           const SliceTransform* prefix_extractor);
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f);
+
+ private:
+  class Rep;
+  Rep* rep_;
+};
+
+extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 000000000..2dda03f31
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,349 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::vector<uint64_t> size_being_compacted_;
+
+  VersionBuilderTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+                  nullptr, false),
+        file_num_(1) {
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    size_being_compacted_.resize(options_.num_levels);
+  }
+
+  ~VersionBuilderTest() override {
+    for (int i = 0; i < vstorage_.num_levels(); i++) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+           uint64_t num_entries = 0, uint64_t num_deletions = 0,
+           bool sampled = false, SequenceNumber smallest_seqno = 0,
+           SequenceNumber largest_seqno = 0) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData(
+        file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
+        GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+        /* marked_for_compact */ false, kInvalidBlobFileNumber,
+        kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+    f->compensated_file_size = file_size;
+    f->num_entries = num_entries;
+    f->num_deletions = num_deletions;
+    vstorage_.AddFile(level, f);
+    if (sampled) {
+      f->init_stats_from_file = true;
+      vstorage_.UpdateAccumulatedStats(f);
+    }
+  }
+
+  void UpdateVersionStorageInfo() {
+    vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri);
+    vstorage_.UpdateNumNonEmptyLevels();
+    vstorage_.GenerateFileIndexer();
+    vstorage_.GenerateLevelFilesBrief();
+    vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+    vstorage_.GenerateLevel0NonOverlapping();
+    vstorage_.SetFinalized();
+  }
+};
+
+void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) {
+  for (int i = 0; i < new_vstorage->num_levels(); i++) {
+    for (auto* f : new_vstorage->LevelFiles(i)) {
+      if (--f->refs == 0) {
+        delete f;
+      }
+    }
+  }
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
+  Add(0, 1U, "150", "200", 100U);
+
+  Add(1, 66U, "150", "200", 100U);
+  Add(1, 88U, "201", "300", 100U);
+
+  Add(2, 6U, "150", "179", 100U);
+  Add(2, 7U, "180", "220", 100U);
+  Add(2, 8U, "221", "300", 100U);
+
+  Add(3, 26U, "150", "170", 100U);
+  Add(3, 27U, "171", "179", 100U);
+  Add(3, 28U, "191", "220", 100U);
+  Add(3, 29U, "221", "300", 100U);
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
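+  // Deleting file 27 drops 100 bytes from level 3: the assertions below
+  // expect 400 bytes on level 2 (files 6, 7, 8 plus new file 666) and
+  // 300 bytes on level 3 (files 26, 28, 29).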
+  version_edit.DeleteFile(3, 27U);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+
+  Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+  Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+  Add(4, 6U, "150", "179", 100U);
+  Add(4, 7U, "180", "220", 100U);
+  Add(4, 8U, "221", "300", 100U);
+
+  Add(5, 26U, "150", "170", 100U);
+  Add(5, 27U, "171", "179", 100U);
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.DeleteFile(0, 1U);
+  version_edit.DeleteFile(0, 88U);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+  ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3));
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4));
+  ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+
+  Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+  Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+  Add(4, 6U, "150", "179", 100U);
+  Add(4, 7U, "180", "220", 100U);
+  Add(4, 8U, "221", "300", 100U);
+
+  Add(5, 26U, "150", "170", 100U);
+  Add(5, 27U, "171", "179", 100U);
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.DeleteFile(0, 1U);
+  version_edit.DeleteFile(0, 88U);
+  version_edit.DeleteFile(4, 6U);
+  version_edit.DeleteFile(4, 7U);
+  version_edit.DeleteFile(4, 8U);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+  ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4));
+  ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+                       GetInternalKey("450"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+                       GetInternalKey("650"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+                       GetInternalKey("550"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+                       GetInternalKey("750"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+                       GetInternalKey("450"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+                       GetInternalKey("650"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+                       GetInternalKey("550"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+                       GetInternalKey("750"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_builder.Apply(&version_edit);
+
+  VersionEdit version_edit2;
+  version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
+                       GetInternalKey("950"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
+  version_edit2.DeleteFile(2, 616);
+  version_edit2.DeleteFile(2, 636);
+  version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
+                       GetInternalKey("850"), 200, 200, false,
+                       kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                       kUnknownFileCreationTime, kUnknownFileChecksum,
+                       kUnknownFileChecksumFuncName);
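+  // Note: files 808 and 806 were added to the first, already-applied
+  // version_edit rather than to version_edit2, so only the two deletions
+  // below reach the builder: 500 - 2 * 100 = 300 bytes expected on level 2.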
+  version_builder.Apply(&version_edit2);
+
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
+  const uint32_t kTotalSamples = 20;
+  const uint32_t kNumLevels = 5;
+  const uint32_t kFilesPerLevel = 8;
+  const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+  const uint32_t kEntriesPerFile = 1000;
+  const uint32_t kDeletionsPerFile = 100;
+  for (uint32_t i = 0; i < kNumFiles; ++i) {
+    Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+        ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(),
+        100U, 0, 100, 100,
+        kEntriesPerFile, kDeletionsPerFile,
+        (i < kTotalSamples));
+  }
+  // Subtract 2x the number of deletion entries, because:
+  // 1x: a deletion entry does not count as a data entry.
+  // 1x: each deletion entry will actually remove one data entry.
+  ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+            (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
new file mode 100644
index 000000000..e45e82656
--- /dev/null
+++ b/src/rocksdb/db/version_edit.cc
@@ -0,0 +1,826 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob_index.h"
+#include "db/version_set.h"
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The unknown file checksum.
+const std::string kUnknownFileChecksum("");
+// The unknown sst file checksum function name.
+const std::string kUnknownFileChecksumFuncName("Unknown");
+// Mask for an identified tag from the future which can be safely ignored.
+const uint32_t kTagSafeIgnoreMask = 1 << 13;
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed. The numbering should be forward compatible
+// so users can downgrade RocksDB safely. A tag from the future is safely
+// ignorable when 'tag & kTagSafeIgnoreMask' is non-zero.
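+//
+// For instance, a decoder following this scheme can handle an unknown tag
+// like this (sketch; SkipLengthPrefixedField is a hypothetical helper):
+//
+//   if (tag & kTagSafeIgnoreMask) {
+//     SkipLengthPrefixedField(&input);  // a varint32 length plus payload
+//   } else {
+//     return Status::Corruption("VersionEdit", "unknown tag");
+//   }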
+enum Tag : uint32_t {
+  kComparator = 1,
+  kLogNumber = 2,
+  kNextFileNumber = 3,
+  kLastSequence = 4,
+  kCompactPointer = 5,
+  kDeletedFile = 6,
+  kNewFile = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber = 9,
+  kMinLogNumberToKeep = 10,
+  // Ignore-able field
+  kDbId = kTagSafeIgnoreMask + 1,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2 = 100,
+  kNewFile3 = 102,
+  kNewFile4 = 103,      // 4th (the latest) format version of adding files
+  kColumnFamily = 200,  // specify column family for version edit
+  kColumnFamilyAdd = 201,
+  kColumnFamilyDrop = 202,
+  kMaxColumnFamily = 203,
+
+  kInAtomicGroup = 300,
+};
+
+enum CustomTag : uint32_t {
+  kTerminate = 1,  // The end of customized fields
+  kNeedCompaction = 2,
+  // Since the manifest is currently not entirely forward-compatible, and the
+  // only forward-compatible part is the CustomTag of kNewFile, we currently
+  // encode kMinLogNumberToKeep as part of a CustomTag as a hack. This should
+  // be removed when the manifest becomes forward-compatible.
+  kMinLogNumberToKeepHack = 3,
+  kOldestBlobFileNumber = 4,
+  kOldestAncesterTime = 5,
+  kFileCreationTime = 6,
+  kFileChecksum = 7,
+  kFileChecksumFuncName = 8,
+  kPathId = 65,
+};
+// If this bit for the custom tag is set, opening DB should fail if
+// we don't know this field.
+uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6;
+
+uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
+  assert(number <= kFileNumberMask);
+  return number | (path_id * (kFileNumberMask + 1));
+}
+
+void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
+                                    SequenceNumber seqno,
+                                    ValueType value_type) {
+  if (smallest.size() == 0) {
+    smallest.DecodeFrom(key);
+  }
+  largest.DecodeFrom(key);
+  fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+  fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+
+#ifndef ROCKSDB_LITE
+  if (value_type == kTypeBlobIndex) {
+    BlobIndex blob_index;
+    const Status s = blob_index.DecodeFrom(value);
+    if (!s.ok()) {
+      return;
+    }
+
+    if (blob_index.IsInlined()) {
+      return;
+    }
+
+    if (blob_index.HasTTL()) {
+      return;
+    }
+
+    // Paranoid check: this should not happen because BlobDB numbers the blob
+    // files starting from 1.
+    if (blob_index.file_number() == kInvalidBlobFileNumber) {
+      return;
+    }
+
+    if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+        oldest_blob_file_number > blob_index.file_number()) {
+      oldest_blob_file_number = blob_index.file_number();
+    }
+  }
+#else
+  (void)value;
+  (void)value_type;
+#endif
+}
+
+void VersionEdit::Clear() {
+  max_level_ = 0;
+  db_id_.clear();
+  comparator_.clear();
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  next_file_number_ = 0;
+  max_column_family_ = 0;
+  min_log_number_to_keep_ = 0;
+  last_sequence_ = 0;
+  has_db_id_ = false;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_max_column_family_ = false;
+  has_min_log_number_to_keep_ = false;
+  has_last_sequence_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+  column_family_ = 0;
+  is_column_family_add_ = false;
+  is_column_family_drop_ = false;
+  column_family_name_.clear();
+  is_in_atomic_group_ = false;
+  remaining_entries_ = 0;
+}
+
+bool VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_db_id_) {
+    PutVarint32(dst, kDbId);
+    PutLengthPrefixedSlice(dst, db_id_);
+  }
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32Varint64(dst, kLogNumber, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
+  }
+  if (has_max_column_family_) {
+    PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32Varint64(dst, kLastSequence, last_sequence_);
+  }
+  for (const auto& deleted : deleted_files_) {
+    PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
+                                deleted.second /* file number */);
+  }
+
+  bool min_log_num_written = false;
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    if (!f.smallest.Valid() || !f.largest.Valid()) {
+      return false;
+    }
+    PutVarint32(dst, kNewFile4);
+    PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
+    PutVarint64(dst, f.fd.GetFileSize());
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+    PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+    // Customized fields' format:
+    // +-----------------------------+
+    // | 1st field's tag (varint32)  |
+    // +-----------------------------+
+    // | 1st field's size (varint32) |
+    // +-----------------------------+
+    // |    bytes for 1st field      |
+    // |  (based on size decoded)    |
+    // +-----------------------------+
+    // |                             |
+    // |          ......             |
+    // |                             |
+    // +-----------------------------+
+    // | last field's size (varint32)|
+    // +-----------------------------+
+    // |    bytes for last field     |
+    // |  (based on size decoded)    |
+    // +-----------------------------+
+    // | terminating tag (varint32)  |
+    // +-----------------------------+
+    //
+    // Customized encoding for fields:
+    //   tag kPathId: 1 byte as path_id
+    //   tag kNeedCompaction: currently can only take the one-char value 1,
+    //        indicating need-compaction
+    //
+    PutVarint32(dst, CustomTag::kOldestAncesterTime);
+    std::string varint_oldest_ancester_time;
+    PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+                             &varint_oldest_ancester_time);
+    PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+    PutVarint32(dst, CustomTag::kFileCreationTime);
+    std::string varint_file_creation_time;
+    PutVarint64(&varint_file_creation_time, f.file_creation_time);
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+                             &varint_file_creation_time);
+    PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+    PutVarint32(dst, CustomTag::kFileChecksum);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+    PutVarint32(dst, CustomTag::kFileChecksumFuncName);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+
+    if (f.fd.GetPathId() != 0) {
+      PutVarint32(dst, CustomTag::kPathId);
+      char p = static_cast<char>(f.fd.GetPathId());
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (f.marked_for_compaction) {
+      PutVarint32(dst, CustomTag::kNeedCompaction);
+      char p = static_cast<char>(1);
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (has_min_log_number_to_keep_ && !min_log_num_written) {
+      PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack);
+      std::string varint_log_number;
+      PutFixed64(&varint_log_number, min_log_number_to_keep_);
+      PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+      min_log_num_written = true;
+    }
+    if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+      PutVarint32(dst, CustomTag::kOldestBlobFileNumber);
+      std::string oldest_blob_file_number;
+      PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+      PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+    }
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+                             dst);
+
+    PutVarint32(dst, CustomTag::kTerminate);
+  }
+
+  // 0 is default and does not need to be explicitly written
+  if (column_family_ != 0) {
+    PutVarint32Varint32(dst, kColumnFamily, column_family_);
+  }
+
+  if (is_column_family_add_) {
+    PutVarint32(dst, kColumnFamilyAdd);
+    PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+  }
+
+  if (is_column_family_drop_) {
+    PutVarint32(dst, kColumnFamilyDrop);
+  }
+
+  if (is_in_atomic_group_) {
+    PutVarint32(dst, kInAtomicGroup);
+    PutVarint32(dst, remaining_entries_);
+  }
+  return true;
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return dst->Valid();
+  } else {
+    return false;
+  }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+  uint32_t v = 0;
+  if (GetVarint32(input, &v)) {
+    *level = v;
+    if (max_level_ < *level) {
+      max_level_ = *level;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
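+
+// PR 3488 encoded min_log_number_to_keep inside a dummy kNewFile4 record:
+// level 0, file number 0, file size 0, and a fixed "dummy_key" as both
+// bounds. Such a record carries no real file and must be dropped once its
+// custom fields have been consumed; the predicate below detects it.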
+static bool is_pseudo_new_file_record_pr3488(
+    const int level,
+    const uint64_t number,
+    const uint64_t file_size,
+    InternalKey& smallest,
+    InternalKey& largest,
+    const bool has_min_log_number_to_keep_) {
+  if (level == 0 && number == 0 && file_size == 0 &&
+      has_min_log_number_to_keep_) {
+    InternalKey dummy_key(Slice("dummy_key"), 0ull, ValueType::kTypeValue);
+    return (*smallest.rep() == *dummy_key.rep() &&
+            *largest.rep() == *dummy_key.rep());
+  } else {
+    return false;
+  }
+}
+
+const char* VersionEdit::DecodeNewFile4From(Slice* input) {
+  const char* msg = nullptr;
+  int level = 0;
+  FileMetaData f;
+  uint64_t number = 0;
+  uint32_t path_id = 0;
+  uint64_t file_size = 0;
+  SequenceNumber smallest_seqno = 0;
+  SequenceNumber largest_seqno = kMaxSequenceNumber;
+  // Since this is the only forward-compatible part of the code, we hack new
+  // extension into this record. When we do, we set this boolean to distinguish
+  // the record from the normal NewFile records.
+  if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+      GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
+      GetInternalKey(input, &f.largest) &&
+      GetVarint64(input, &smallest_seqno) &&
+      GetVarint64(input, &largest_seqno)) {
+    // See comments in VersionEdit::EncodeTo() for format of customized fields
+    while (true) {
+      uint32_t custom_tag = 0;
+      Slice field;
+      if (!GetVarint32(input, &custom_tag)) {
+        return "new-file4 custom field";
+      }
+      if (custom_tag == kTerminate) {
+        break;
+      }
+      if (!GetLengthPrefixedSlice(input, &field)) {
+        return "new-file4 custom field length prefixed slice error";
+      }
+      switch (custom_tag) {
+        case kPathId:
+          if (field.size() != 1) {
+            return "path_id field wrong size";
+          }
+          path_id = field[0];
+          if (path_id > 3) {
+            return "path_id wrong value";
+          }
+          break;
+        case kOldestAncesterTime:
+          if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+            return "invalid oldest ancester time";
+          }
+          break;
+        case kFileCreationTime:
+          if (!GetVarint64(&field, &f.file_creation_time)) {
+            return "invalid file creation time";
+          }
+          break;
+        case kFileChecksum:
+          f.file_checksum = field.ToString();
+          break;
+        case kFileChecksumFuncName:
+          f.file_checksum_func_name = field.ToString();
+          break;
+        case kNeedCompaction:
+          if (field.size() != 1) {
+            return "need_compaction field wrong size";
+          }
+          f.marked_for_compaction = (field[0] == 1);
+          break;
+        case kMinLogNumberToKeepHack:
+          // This is a hack to encode kMinLogNumberToKeep in a
+          // forward-compatible fashion.
+          if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+            return "deleted log number malformatted";
+          }
+          has_min_log_number_to_keep_ = true;
+          break;
+        case kOldestBlobFileNumber:
+          if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+            return "invalid oldest blob file number";
+          }
+          break;
+        default:
+          if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+            // Should not proceed if cannot understand it
+            return "new-file4 custom field not supported";
+          }
+          break;
+      }
+    }
+  } else {
+    return "new-file4 entry";
+  }
+  if (is_pseudo_new_file_record_pr3488(level, number, file_size,
+                                       f.smallest, f.largest,
+                                       has_min_log_number_to_keep_)) {
+    // Since this has nothing to do with NewFile, return immediately.
+ return nullptr; + } + f.fd = + FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno); + new_files_.push_back(std::make_pair(level, f)); + return nullptr; +} + +Status VersionEdit::DecodeFrom(const Slice& src) { + Clear(); + Slice input = src; + const char* msg = nullptr; + uint32_t tag = 0; + + // Temporary storage for parsing + int level = 0; + FileMetaData f; + Slice str; + InternalKey key; + while (msg == nullptr && GetVarint32(&input, &tag)) { + switch (tag) { + case kDbId: + if (GetLengthPrefixedSlice(&input, &str)) { + db_id_ = str.ToString(); + has_db_id_ = true; + } else { + msg = "db id"; + } + break; + case kComparator: + if (GetLengthPrefixedSlice(&input, &str)) { + comparator_ = str.ToString(); + has_comparator_ = true; + } else { + msg = "comparator name"; + } + break; + + case kLogNumber: + if (GetVarint64(&input, &log_number_)) { + has_log_number_ = true; + } else { + msg = "log number"; + } + break; + + case kPrevLogNumber: + if (GetVarint64(&input, &prev_log_number_)) { + has_prev_log_number_ = true; + } else { + msg = "previous log number"; + } + break; + + case kNextFileNumber: + if (GetVarint64(&input, &next_file_number_)) { + has_next_file_number_ = true; + } else { + msg = "next file number"; + } + break; + + case kMaxColumnFamily: + if (GetVarint32(&input, &max_column_family_)) { + has_max_column_family_ = true; + } else { + msg = "max column family"; + } + break; + + case kMinLogNumberToKeep: + if (GetVarint64(&input, &min_log_number_to_keep_)) { + has_min_log_number_to_keep_ = true; + } else { + msg = "min log number to keep"; + } + break; + + case kLastSequence: + if (GetVarint64(&input, &last_sequence_)) { + has_last_sequence_ = true; + } else { + msg = "last sequence number"; + } + break; + + case kCompactPointer: + if (GetLevel(&input, &level, &msg) && + GetInternalKey(&input, &key)) { + // we don't use compact pointers anymore, + // but we should not fail if they are still + // in the manifest + } else { + if (!msg) { + msg = "compaction pointer"; + } + } + break; + + case kDeletedFile: { + uint64_t number = 0; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) { + deleted_files_.insert(std::make_pair(level, number)); + } else { + if (!msg) { + msg = "deleted file"; + } + } + break; + } + + case kNewFile: { + uint64_t number = 0; + uint64_t file_size = 0; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && + GetVarint64(&input, &file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest)) { + f.fd = FileDescriptor(number, 0, file_size); + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file entry"; + } + } + break; + } + case kNewFile2: { + uint64_t number = 0; + uint64_t file_size = 0; + SequenceNumber smallest_seqno = 0; + SequenceNumber largest_seqno = kMaxSequenceNumber; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && + GetVarint64(&input, &file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest) && + GetVarint64(&input, &smallest_seqno) && + GetVarint64(&input, &largest_seqno)) { + f.fd = FileDescriptor(number, 0, file_size, smallest_seqno, + largest_seqno); + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file2 entry"; + } + } + break; + } + + case kNewFile3: { + uint64_t number = 0; + uint32_t path_id = 0; + uint64_t file_size = 0; + SequenceNumber smallest_seqno = 0; + SequenceNumber largest_seqno = kMaxSequenceNumber; + if
(GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && + GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) && + GetInternalKey(&input, &f.smallest) && + GetInternalKey(&input, &f.largest) && + GetVarint64(&input, &smallest_seqno) && + GetVarint64(&input, &largest_seqno)) { + f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno, + largest_seqno); + new_files_.push_back(std::make_pair(level, f)); + } else { + if (!msg) { + msg = "new-file3 entry"; + } + } + break; + } + + case kNewFile4: { + msg = DecodeNewFile4From(&input); + break; + } + + case kColumnFamily: + if (!GetVarint32(&input, &column_family_)) { + if (!msg) { + msg = "set column family id"; + } + } + break; + + case kColumnFamilyAdd: + if (GetLengthPrefixedSlice(&input, &str)) { + is_column_family_add_ = true; + column_family_name_ = str.ToString(); + } else { + if (!msg) { + msg = "column family add"; + } + } + break; + + case kColumnFamilyDrop: + is_column_family_drop_ = true; + break; + + case kInAtomicGroup: + is_in_atomic_group_ = true; + if (!GetVarint32(&input, &remaining_entries_)) { + if (!msg) { + msg = "remaining entries"; + } + } + break; + + default: + if (tag & kTagSafeIgnoreMask) { + // Tag from the future which can be safely ignored. + // The next field must be the length of the entry. + uint32_t field_len; + if (!GetVarint32(&input, &field_len) || + static_cast<uint64_t>(field_len) > input.size()) { + if (!msg) { + msg = "safely ignorable tag length error"; + } + } else { + input.remove_prefix(static_cast<size_t>(field_len)); + } + } else { + msg = "unknown tag"; + } + break; + } + } + + if (msg == nullptr && !input.empty()) { + msg = "invalid tag"; + } + + Status result; + if (msg != nullptr) { + result = Status::Corruption("VersionEdit", msg); + } + return result; +} + +std::string VersionEdit::DebugString(bool hex_key) const { + std::string r; + r.append("VersionEdit {"); + if (has_db_id_) { + r.append("\n DB ID: "); + r.append(db_id_); + } + if (has_comparator_) { + r.append("\n Comparator: "); + r.append(comparator_); + } + if (has_log_number_) { + r.append("\n LogNumber: "); + AppendNumberTo(&r, log_number_); + } + if (has_prev_log_number_) { + r.append("\n PrevLogNumber: "); + AppendNumberTo(&r, prev_log_number_); + } + if (has_next_file_number_) { + r.append("\n NextFileNumber: "); + AppendNumberTo(&r, next_file_number_); + } + if (has_max_column_family_) { + r.append("\n MaxColumnFamily: "); + AppendNumberTo(&r, max_column_family_); + } + if (has_min_log_number_to_keep_) { + r.append("\n MinLogNumberToKeep: "); + AppendNumberTo(&r, min_log_number_to_keep_); + } + if (has_last_sequence_) { + r.append("\n LastSeq: "); + AppendNumberTo(&r, last_sequence_); + } + for (const auto& deleted_file : deleted_files_) { + r.append("\n DeleteFile: "); + AppendNumberTo(&r, deleted_file.first); + r.append(" "); + AppendNumberTo(&r, deleted_file.second); + } + for (size_t i = 0; i < new_files_.size(); i++) { + const FileMetaData& f = new_files_[i].second; + r.append("\n AddFile: "); + AppendNumberTo(&r, new_files_[i].first); + r.append(" "); + AppendNumberTo(&r, f.fd.GetNumber()); + r.append(" "); + AppendNumberTo(&r, f.fd.GetFileSize()); + r.append(" "); + r.append(f.smallest.DebugString(hex_key)); + r.append(" .. 
"); + r.append(f.largest.DebugString(hex_key)); + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, f.oldest_blob_file_number); + } + r.append(" oldest_ancester_time:"); + AppendNumberTo(&r, f.oldest_ancester_time); + r.append(" file_creation_time:"); + AppendNumberTo(&r, f.file_creation_time); + r.append(" file_checksum:"); + r.append(f.file_checksum); + r.append(" file_checksum_func_name: "); + r.append(f.file_checksum_func_name); + } + r.append("\n ColumnFamily: "); + AppendNumberTo(&r, column_family_); + if (is_column_family_add_) { + r.append("\n ColumnFamilyAdd: "); + r.append(column_family_name_); + } + if (is_column_family_drop_) { + r.append("\n ColumnFamilyDrop"); + } + if (is_in_atomic_group_) { + r.append("\n AtomicGroup: "); + AppendNumberTo(&r, remaining_entries_); + r.append(" entries remains"); + } + r.append("\n}\n"); + return r; +} + +std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { + JSONWriter jw; + jw << "EditNumber" << edit_num; + + if (has_db_id_) { + jw << "DB ID" << db_id_; + } + if (has_comparator_) { + jw << "Comparator" << comparator_; + } + if (has_log_number_) { + jw << "LogNumber" << log_number_; + } + if (has_prev_log_number_) { + jw << "PrevLogNumber" << prev_log_number_; + } + if (has_next_file_number_) { + jw << "NextFileNumber" << next_file_number_; + } + if (has_max_column_family_) { + jw << "MaxColumnFamily" << max_column_family_; + } + if (has_min_log_number_to_keep_) { + jw << "MinLogNumberToKeep" << min_log_number_to_keep_; + } + if (has_last_sequence_) { + jw << "LastSeq" << last_sequence_; + } + + if (!deleted_files_.empty()) { + jw << "DeletedFiles"; + jw.StartArray(); + + for (const auto& deleted_file : deleted_files_) { + jw.StartArrayedObject(); + jw << "Level" << deleted_file.first; + jw << "FileNumber" << deleted_file.second; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!new_files_.empty()) { + jw << "AddedFiles"; + jw.StartArray(); + + for (size_t i = 0; i < new_files_.size(); i++) { + jw.StartArrayedObject(); + jw << "Level" << new_files_[i].first; + const FileMetaData& f = new_files_[i].second; + jw << "FileNumber" << f.fd.GetNumber(); + jw << "FileSize" << f.fd.GetFileSize(); + jw << "SmallestIKey" << f.smallest.DebugString(hex_key); + jw << "LargestIKey" << f.largest.DebugString(hex_key); + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + jw << "OldestBlobFile" << f.oldest_blob_file_number; + } + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + jw << "ColumnFamily" << column_family_; + + if (is_column_family_add_) { + jw << "ColumnFamilyAdd" << column_family_name_; + } + if (is_column_family_drop_) { + jw << "ColumnFamilyDrop" << column_family_name_; + } + if (is_in_atomic_group_) { + jw << "AtomicGroup" << remaining_entries_; + } + + jw.EndObject(); + + return jw.Get(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h new file mode 100644 index 000000000..6d1893f2a --- /dev/null +++ b/src/rocksdb/db/version_edit.h @@ -0,0 +1,438 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once +#include <algorithm> +#include <set> +#include <string> +#include <utility> +#include <vector> +#include "db/dbformat.h" +#include "memory/arena.h" +#include "rocksdb/cache.h" +#include "table/table_reader.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class VersionSet; + +constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; +constexpr uint64_t kInvalidBlobFileNumber = 0; +constexpr uint64_t kUnknownOldestAncesterTime = 0; +constexpr uint64_t kUnknownFileCreationTime = 0; + +extern const std::string kUnknownFileChecksum; +extern const std::string kUnknownFileChecksumFuncName; + +extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); + +// A copyable structure that contains the information needed to read data from +// an SST file. It can contain a pointer to a table reader opened for the file, +// or the file number and size, which can be used to create a new table reader +// for it. The behavior is undefined when a copy of the structure is used after +// the file is no longer in any live version. +struct FileDescriptor { + // Table reader in table_reader_handle + TableReader* table_reader; + uint64_t packed_number_and_path_id; + uint64_t file_size; // File size in bytes + SequenceNumber smallest_seqno; // The smallest seqno in this file + SequenceNumber largest_seqno; // The largest seqno in this file + + FileDescriptor() : FileDescriptor(0, 0, 0) {} + + FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) + : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {} + + FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno) + : table_reader(nullptr), + packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), + file_size(_file_size), + smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno) {} + + FileDescriptor(const FileDescriptor& fd) { *this = fd; } + + FileDescriptor& operator=(const FileDescriptor& fd) { + table_reader = fd.table_reader; + packed_number_and_path_id = fd.packed_number_and_path_id; + file_size = fd.file_size; + smallest_seqno = fd.smallest_seqno; + largest_seqno = fd.largest_seqno; + return *this; + } + + uint64_t GetNumber() const { + return packed_number_and_path_id & kFileNumberMask; + } + uint32_t GetPathId() const { + return static_cast<uint32_t>( + packed_number_and_path_id / (kFileNumberMask + 1)); + } + uint64_t GetFileSize() const { return file_size; } +}; + +struct FileSampledStats { + FileSampledStats() : num_reads_sampled(0) {} + FileSampledStats(const FileSampledStats& other) { *this = other; } + FileSampledStats& operator=(const FileSampledStats& other) { + num_reads_sampled = other.num_reads_sampled.load(); + return *this; + } + + // number of user reads to this file. + mutable std::atomic<uint64_t> num_reads_sampled; +}; + +struct FileMetaData { + FileDescriptor fd; + InternalKey smallest; // Smallest internal key served by table + InternalKey largest; // Largest internal key served by table + + // Needs to be disposed when refs becomes 0. + Cache::Handle* table_reader_handle = nullptr; + + FileSampledStats stats; + + // Stats for compensating deletion entries during compaction + + // File size compensated by deletion entry. + // This is updated in Version::UpdateAccumulatedStats() the first time the + // file is created or loaded. After it is updated (!= 0), it is immutable.
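The packed_number_and_path_id field above stores the file number in the low 62 bits and the path id above them; multiplying by (kFileNumberMask + 1) is equivalent to a left shift by 62. A self-contained sketch of that arithmetic, mirroring GetNumber()/GetPathId() (PackFileNumberAndPathId itself is defined in version_edit.cc and is assumed here to use the same formula):

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint64_t kMask = 0x3FFFFFFFFFFFFFFF;  // kFileNumberMask: low 62 bits
  uint64_t number = 12345;  // must fit within kMask
  uint64_t path_id = 2;
  // Pack: number in the low bits, path_id shifted above them.
  uint64_t packed = number | (path_id * (kMask + 1));
  // Unpack: the same operations GetNumber() and GetPathId() perform.
  assert((packed & kMask) == number);
  assert(packed / (kMask + 1) == path_id);
  return 0;
}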
+ uint64_t compensated_file_size = 0; + // These values can mutate, but they can only be read or written from + // the single-threaded LogAndApply thread + uint64_t num_entries = 0; // the number of entries. + uint64_t num_deletions = 0; // the number of deletion entries. + uint64_t raw_key_size = 0; // total uncompressed key size. + uint64_t raw_value_size = 0; // total uncompressed value size. + + int refs = 0; // Reference count + + bool being_compacted = false; // Is this file undergoing compaction? + bool init_stats_from_file = false; // true if the data-entry stats of this + // file have been initialized from the file. + + bool marked_for_compaction = false; // True if client asked us nicely to + // compact this file. + + // Used only in BlobDB. The file number of the oldest blob file this SST file + // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + + // The file could be the compaction output from other SST files, which could + // in turn be outputs of compacting older SST files. We track the memtable + // flush timestamp of the oldest SST file that eventually contributed data + // to this file. 0 means the information is not available. + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + + // Unix time when the SST file is created. + uint64_t file_creation_time = kUnknownFileCreationTime; + + // File checksum + std::string file_checksum = kUnknownFileChecksum; + + // File checksum function name + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; + + FileMetaData() = default; + + FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, + const InternalKey& smallest_key, const InternalKey& largest_key, + const SequenceNumber& smallest_seq, + const SequenceNumber& largest_seq, bool marked_for_compact, + uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, + uint64_t _file_creation_time, const std::string& _file_checksum, + const std::string& _file_checksum_func_name) + : fd(file, file_path_id, file_size, smallest_seq, largest_seq), + smallest(smallest_key), + largest(largest_key), + marked_for_compaction(marked_for_compact), + oldest_blob_file_number(oldest_blob_file), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + file_checksum(_file_checksum), + file_checksum_func_name(_file_checksum_func_name) { + TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); + } + + // REQUIRED: Keys must be given to the function in sorted order (it expects + // the last key to be the largest). + void UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, ValueType value_type); + + // Unlike UpdateBoundaries, ranges do not need to be presented in any + // particular order. + void UpdateBoundariesForRange(const InternalKey& start, + const InternalKey& end, SequenceNumber seqno, + const InternalKeyComparator& icmp) { + if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) { + smallest = start; + } + if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { + largest = end; + } + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); + } + + // Try to get the oldest ancester time from the class itself or table + // properties if the table reader is already pinned. + // 0 means the information is not available.
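UpdateBoundariesForRange() above only ever widens the key range and the seqno range, which is why ranges may be presented in any order. A small sketch of that property, assuming an in-tree build (BytewiseComparator and the default FileDescriptor's seqno extremes come from the headers above):

#include <cassert>
#include "db/dbformat.h"
#include "db/version_edit.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  InternalKeyComparator icmp(BytewiseComparator());
  FileMetaData f;  // fd starts at (smallest_seqno=kMaxSequenceNumber, largest_seqno=0)
  // Ranges arrive out of order; the boundaries still converge to [a, z].
  f.UpdateBoundariesForRange(InternalKey("m", 10, kTypeValue),
                             InternalKey("z", 10, kTypeValue), 10, icmp);
  f.UpdateBoundariesForRange(InternalKey("a", 7, kTypeValue),
                             InternalKey("c", 7, kTypeValue), 7, icmp);
  assert(f.smallest.user_key() == Slice("a"));
  assert(f.largest.user_key() == Slice("z"));
  assert(f.fd.smallest_seqno == 7 && f.fd.largest_seqno == 10);
  return 0;
}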
+ uint64_t TryGetOldestAncesterTime() { + if (oldest_ancester_time != kUnknownOldestAncesterTime) { + return oldest_ancester_time; + } else if (fd.table_reader != nullptr && + fd.table_reader->GetTableProperties() != nullptr) { + return fd.table_reader->GetTableProperties()->creation_time; + } + return kUnknownOldestAncesterTime; + } + + uint64_t TryGetFileCreationTime() { + if (file_creation_time != kUnknownFileCreationTime) { + return file_creation_time; + } else if (fd.table_reader != nullptr && + fd.table_reader->GetTableProperties() != nullptr) { + return fd.table_reader->GetTableProperties()->file_creation_time; + } + return kUnknownFileCreationTime; + } +}; + +// A compressed copy of file meta data that contains just the minimum data +// needed to serve read operations, while still keeping a pointer to the full +// metadata of the file in case it is needed. +struct FdWithKeyRange { + FileDescriptor fd; + FileMetaData* file_metadata; // Points to all metadata + Slice smallest_key; // slice that contains the smallest key + Slice largest_key; // slice that contains the largest key + + FdWithKeyRange() + : fd(), + file_metadata(nullptr), + smallest_key(), + largest_key() { + } + + FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key, + FileMetaData* _file_metadata) + : fd(_fd), + file_metadata(_file_metadata), + smallest_key(_smallest_key), + largest_key(_largest_key) {} +}; + +// Data structure to store an array of FdWithKeyRange in one level. +// The actual data is guaranteed to be stored close together. +struct LevelFilesBrief { + size_t num_files; + FdWithKeyRange* files; + LevelFilesBrief() { + num_files = 0; + files = nullptr; + } +}; + +// The state of a DB at any given time is referred to as a Version. +// Any modification to the Version is considered a Version Edit. A Version is +// constructed by joining a sequence of Version Edits. Version Edits are written +// to the MANIFEST file.
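Since a Version is the fold of a sequence of Version Edits, the class declared below can be exercised end to end: populate an edit, round-trip it through the manifest encoding, and apply its additions and deletions to a toy file set. A sketch assuming an in-tree build; VersionBuilder is the real, far more careful implementation of the "apply" step.

#include <cassert>
#include <set>
#include <string>
#include <utility>
#include "db/version_edit.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  VersionEdit edit;
  edit.AddFile(/*level=*/1, /*file=*/9, /*file_path_id=*/0, /*file_size=*/100,
               InternalKey("a", 5, kTypeValue), InternalKey("k", 6, kTypeValue),
               5, 6, false, kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
               kUnknownFileCreationTime, kUnknownFileChecksum,
               kUnknownFileChecksumFuncName);
  edit.DeleteFile(/*level=*/1, /*file=*/7);

  // Round-trip through the manifest record encoding.
  std::string manifest_record;
  assert(edit.EncodeTo(&manifest_record));
  VersionEdit decoded;
  assert(decoded.DecodeFrom(manifest_record).ok());

  // Toy "version": the set of (level, file number) pairs that are live.
  std::set<std::pair<int, uint64_t>> live = {{1, 7}, {1, 8}};
  for (const auto& lf : decoded.GetDeletedFiles()) {
    live.erase(lf);
  }
  for (const auto& lf : decoded.GetNewFiles()) {
    live.emplace(lf.first, lf.second.fd.GetNumber());
  }
  assert(live.count({1, 9}) == 1 && live.count({1, 7}) == 0);
  return 0;
}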
+class VersionEdit { + public: + void Clear(); + + void SetDBId(const std::string& db_id) { + has_db_id_ = true; + db_id_ = db_id; + } + bool HasDbId() const { return has_db_id_; } + const std::string& GetDbId() const { return db_id_; } + + void SetComparatorName(const Slice& name) { + has_comparator_ = true; + comparator_ = name.ToString(); + } + bool HasComparatorName() const { return has_comparator_; } + const std::string& GetComparatorName() const { return comparator_; } + + void SetLogNumber(uint64_t num) { + has_log_number_ = true; + log_number_ = num; + } + bool HasLogNumber() const { return has_log_number_; } + uint64_t GetLogNumber() const { return log_number_; } + + void SetPrevLogNumber(uint64_t num) { + has_prev_log_number_ = true; + prev_log_number_ = num; + } + bool HasPrevLogNumber() const { return has_prev_log_number_; } + uint64_t GetPrevLogNumber() const { return prev_log_number_; } + + void SetNextFile(uint64_t num) { + has_next_file_number_ = true; + next_file_number_ = num; + } + bool HasNextFile() const { return has_next_file_number_; } + uint64_t GetNextFile() const { return next_file_number_; } + + void SetMaxColumnFamily(uint32_t max_column_family) { + has_max_column_family_ = true; + max_column_family_ = max_column_family; + } + bool HasMaxColumnFamily() const { return has_max_column_family_; } + uint32_t GetMaxColumnFamily() const { return max_column_family_; } + + void SetMinLogNumberToKeep(uint64_t num) { + has_min_log_number_to_keep_ = true; + min_log_number_to_keep_ = num; + } + bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; } + uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; } + + void SetLastSequence(SequenceNumber seq) { + has_last_sequence_ = true; + last_sequence_ = seq; + } + bool HasLastSequence() const { return has_last_sequence_; } + SequenceNumber GetLastSequence() const { return last_sequence_; } + + // Delete the specified "file" from the specified "level". + void DeleteFile(int level, uint64_t file) { + deleted_files_.emplace(level, file); + } + + // Retrieve the files deleted as well as their associated levels. + using DeletedFiles = std::set>; + const DeletedFiles& GetDeletedFiles() const { return deleted_files_; } + + // Add the specified file at the specified level. + // REQUIRES: This version has not been saved (see VersionSet::SaveTo) + // REQUIRES: "smallest" and "largest" are smallest and largest keys in file + // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file + // referred to by this file if any, kInvalidBlobFileNumber otherwise. 
+ void AddFile(int level, uint64_t file, uint32_t file_path_id, + uint64_t file_size, const InternalKey& smallest, + const InternalKey& largest, const SequenceNumber& smallest_seqno, + const SequenceNumber& largest_seqno, bool marked_for_compaction, + uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, + uint64_t file_creation_time, const std::string& file_checksum, + const std::string& file_checksum_func_name) { + assert(smallest_seqno <= largest_seqno); + new_files_.emplace_back( + level, FileMetaData(file, file_path_id, file_size, smallest, largest, + smallest_seqno, largest_seqno, + marked_for_compaction, oldest_blob_file_number, + oldest_ancester_time, file_creation_time, + file_checksum, file_checksum_func_name)); + } + + void AddFile(int level, const FileMetaData& f) { + assert(f.fd.smallest_seqno <= f.fd.largest_seqno); + new_files_.emplace_back(level, f); + } + + // Retrieve the files added as well as their associated levels. + using NewFiles = std::vector>; + const NewFiles& GetNewFiles() const { return new_files_; } + + // Number of edits + size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); } + + void SetColumnFamily(uint32_t column_family_id) { + column_family_ = column_family_id; + } + uint32_t GetColumnFamily() const { return column_family_; } + + // set column family ID by calling SetColumnFamily() + void AddColumnFamily(const std::string& name) { + assert(!is_column_family_drop_); + assert(!is_column_family_add_); + assert(NumEntries() == 0); + is_column_family_add_ = true; + column_family_name_ = name; + } + + // set column family ID by calling SetColumnFamily() + void DropColumnFamily() { + assert(!is_column_family_drop_); + assert(!is_column_family_add_); + assert(NumEntries() == 0); + is_column_family_drop_ = true; + } + + bool IsColumnFamilyManipulation() const { + return is_column_family_add_ || is_column_family_drop_; + } + + void MarkAtomicGroup(uint32_t remaining_entries) { + is_in_atomic_group_ = true; + remaining_entries_ = remaining_entries; + } + bool IsInAtomicGroup() const { return is_in_atomic_group_; } + uint32_t GetRemainingEntries() const { return remaining_entries_; } + + // return true on success. + bool EncodeTo(std::string* dst) const; + Status DecodeFrom(const Slice& src); + + std::string DebugString(bool hex_key = false) const; + std::string DebugJSON(int edit_num, bool hex_key = false) const; + + private: + friend class ReactiveVersionSet; + friend class VersionSet; + friend class Version; + friend class AtomicGroupReadBuffer; + + bool GetLevel(Slice* input, int* level, const char** msg); + + const char* DecodeNewFile4From(Slice* input); + + int max_level_ = 0; + std::string db_id_; + std::string comparator_; + uint64_t log_number_ = 0; + uint64_t prev_log_number_ = 0; + uint64_t next_file_number_ = 0; + uint32_t max_column_family_ = 0; + // The most recent WAL log number that is deleted + uint64_t min_log_number_to_keep_ = 0; + SequenceNumber last_sequence_ = 0; + bool has_db_id_ = false; + bool has_comparator_ = false; + bool has_log_number_ = false; + bool has_prev_log_number_ = false; + bool has_next_file_number_ = false; + bool has_max_column_family_ = false; + bool has_min_log_number_to_keep_ = false; + bool has_last_sequence_ = false; + + DeletedFiles deleted_files_; + NewFiles new_files_; + + // Each version edit record should have column_family_ set + // If it's not set, it is default (0) + uint32_t column_family_ = 0; + // a version edit can be either column_family add or + // column_family drop. 
If it's column family add, + // it also includes column family name. + bool is_column_family_drop_ = false; + bool is_column_family_add_ = false; + std::string column_family_name_; + + bool is_in_atomic_group_ = false; + uint32_t remaining_entries_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc new file mode 100644 index 000000000..8bc884df9 --- /dev/null +++ b/src/rocksdb/db/version_edit_test.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +static void TestEncodeDecode(const VersionEdit& edit) { + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + parsed.EncodeTo(&encoded2); + ASSERT_EQ(encoded, encoded2); +} + +class VersionEditTest : public testing::Test {}; + +TEST_F(VersionEditTest, EncodeDecode) { + static const uint64_t kBig = 1ull << 50; + static const uint32_t kBig32Bit = 1ull << 30; + + VersionEdit edit; + for (int i = 0; i < 4; i++) { + TestEncodeDecode(edit); + edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, + InternalKey("foo", kBig + 500 + i, kTypeValue), + InternalKey("zoo", kBig + 600 + i, kTypeDeletion), + kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber, + 888, 678, "234", "crc32c"); + edit.DeleteFile(4, kBig + 700 + i); + } + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, EncodeDecodeNewFile4) { + static const uint64_t kBig = 1ull << 50; + + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), + InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, + kBig + 601, false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), + InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, + kBig + 602, true, kInvalidBlobFileNumber, 666, 888, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), + InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, + kBig + 603, true, 1001, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName); + ; + + edit.DeleteFile(4, 700); + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 
1000); + TestEncodeDecode(edit); + + std::string encoded, encoded2; + edit.EncodeTo(&encoded); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + auto& new_files = parsed.GetNewFiles(); + ASSERT_TRUE(new_files[0].second.marked_for_compaction); + ASSERT_TRUE(!new_files[1].second.marked_for_compaction); + ASSERT_TRUE(new_files[2].second.marked_for_compaction); + ASSERT_TRUE(new_files[3].second.marked_for_compaction); + ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[2].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[3].second.fd.GetPathId()); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[0].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[1].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[2].second.oldest_blob_file_number); + ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number); +} + +TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { + static const uint64_t kBig = 1ull << 50; + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), + InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, + kBig + 601, false, kInvalidBlobFileNumber, 686, 868, "234", + "crc32c"); + edit.DeleteFile(4, 700); + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + + std::string encoded; + + // Call back function to add extra customized builds. + bool first = true; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) { + std::string* str = reinterpret_cast(arg); + PutVarint32(str, 33); + const std::string str1 = "random_string"; + PutLengthPrefixedSlice(str, str1); + if (first) { + first = false; + PutVarint32(str, 22); + const std::string str2 = "s"; + PutLengthPrefixedSlice(str, str2); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + edit.EncodeTo(&encoded); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_TRUE(!first); + auto& new_files = parsed.GetNewFiles(); + ASSERT_TRUE(new_files[0].second.marked_for_compaction); + ASSERT_TRUE(!new_files[1].second.marked_for_compaction); + ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(1u, parsed.GetDeletedFiles().size()); +} + +TEST_F(VersionEditTest, NewFile4NotSupportedField) { + static const uint64_t kBig = 1ull << 50; + VersionEdit edit; + edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), + InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + + edit.SetComparatorName("foo"); + edit.SetLogNumber(kBig + 100); + edit.SetNextFile(kBig + 200); + edit.SetLastSequence(kBig + 1000); + + std::string encoded; + + // Call back function to add extra customized builds. 
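These tests lean on the convention from DecodeNewFile4From() that a reader may skip an unknown custom tag only when its non-safe-ignore bit is clear; tags 33 and 22 injected in ForwardCompatibleNewFile4 are ignorable under that rule. A sketch of the classification, assuming kCustomTagNonSafeIgnoreMask is the 1 << 6 bit (the actual constant lives in db/version_edit.h and may differ):

#include <cassert>
#include <cstdint>

// Assumed value of kCustomTagNonSafeIgnoreMask; see db/version_edit.h.
constexpr uint32_t kNonSafeIgnoreMask = 1u << 6;

// An old binary may skip a custom field it does not understand only when the
// writer left this bit clear; otherwise it must fail the decode.
bool MustUnderstand(uint32_t custom_tag) {
  return (custom_tag & kNonSafeIgnoreMask) != 0;
}

int main() {
  assert(!MustUnderstand(33));     // ignorable, as in ForwardCompatibleNewFile4
  assert(!MustUnderstand(22));     // likewise ignorable
  assert(MustUnderstand(64 + 1));  // bit 6 set: decode must reject
  return 0;
}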
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) { + std::string* str = reinterpret_cast(arg); + const std::string str1 = "s"; + PutLengthPrefixedSlice(str, str1); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + edit.EncodeTo(&encoded); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_NOK(s); +} + +TEST_F(VersionEditTest, EncodeEmptyFile) { + VersionEdit edit; + edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName); + std::string buffer; + ASSERT_TRUE(!edit.EncodeTo(&buffer)); +} + +TEST_F(VersionEditTest, ColumnFamilyTest) { + VersionEdit edit; + edit.SetColumnFamily(2); + edit.AddColumnFamily("column_family"); + edit.SetMaxColumnFamily(5); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetColumnFamily(3); + edit.DropColumnFamily(); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, MinLogNumberToKeep) { + VersionEdit edit; + edit.SetMinLogNumberToKeep(13); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetMinLogNumberToKeep(23); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, AtomicGroupTest) { + VersionEdit edit; + edit.MarkAtomicGroup(1); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, IgnorableField) { + VersionEdit ve; + std::string encoded; + + // Size of ignorable field is too large + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, + 0x2710 /* A field with kTagSafeIgnoreMask set */, + 5 /* fieldlength 5 */); + encoded += "abc"; // Only fills 3 bytes, + ASSERT_NOK(ve.DecodeFrom(encoded)); + + encoded.clear(); + // Error when seeing unidentified tag that is not ignorable + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */, + 3 /* fieldlength 3 */); + encoded += "abc"; // Fill 3 bytes + PutVarint32Varint64(&encoded, 3 /* next file number */, 88); + ASSERT_NOK(ve.DecodeFrom(encoded)); + + // Safely ignore an identified but safely ignorable entry + encoded.clear(); + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, + 0x2710 /* A field with kTagSafeIgnoreMask set */, + 3 /* fieldlength 3 */); + encoded += "abc"; // Fill 3 bytes + PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88); + + ASSERT_OK(ve.DecodeFrom(encoded)); + + ASSERT_TRUE(ve.HasLogNumber()); + ASSERT_TRUE(ve.HasNextFile()); + ASSERT_EQ(66, ve.GetLogNumber()); + ASSERT_EQ(88, ve.GetNextFile()); +} + +TEST_F(VersionEditTest, DbId) { + VersionEdit edit; + edit.SetDBId("ab34-cd12-435f-er00"); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetDBId("34ba-cd12-435f-er01"); + TestEncodeDecode(edit); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc new file mode 100644 index 000000000..e913a97dd --- /dev/null +++ b/src/rocksdb/db/version_set.cc @@ -0,0 +1,6005 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_set.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compaction/compaction.h" +#include "db/internal_stats.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/table_cache.h" +#include "db/version_builder.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "monitoring/file_read_sample.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/merging_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/plain/plain_table_factory.h" +#include "table/table_reader.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/user_comparator_wrapper.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Find File in LevelFilesBrief data structure +// Within an index range defined by left and right +int FindFileInRange(const InternalKeyComparator& icmp, + const LevelFilesBrief& file_level, + const Slice& key, + uint32_t left, + uint32_t right) { + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; + }; + const auto &b = file_level.files; + return static_cast(std::lower_bound(b + left, + b + right, key, cmp) - b); +} + +Status OverlapWithIterator(const Comparator* ucmp, + const Slice& smallest_user_key, + const Slice& largest_user_key, + InternalIterator* iter, + bool* overlap) { + InternalKey range_start(smallest_user_key, kMaxSequenceNumber, + kValueTypeForSeek); + iter->Seek(range_start.Encode()); + if (!iter->status().ok()) { + return iter->status(); + } + + *overlap = false; + if (iter->Valid()) { + ParsedInternalKey seek_result; + if (!ParseInternalKey(iter->key(), &seek_result)) { + return Status::Corruption("DB have corrupted keys"); + } + + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= + 0) { + *overlap = true; + } + } + + return iter->status(); +} + +// Class to help choose the next file to search for the particular key. +// Searches and returns files level by level. +// We can search level-by-level since entries never hop across +// levels. Therefore we are guaranteed that if we find data +// in a smaller level, later levels are irrelevant (unless we +// are MergeInProgress). 
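FindFileInRange() above is the primitive behind that level-by-level search: on a sorted, disjoint level, std::lower_bound over the files' largest keys yields the first file that could contain the lookup key. A self-contained sketch of the same idea on plain strings (the real code compares internal keys through the InternalKeyComparator):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct ToyFile { std::string smallest, largest; };

// First index whose file may contain `key`: files are sorted and disjoint,
// so it is the first file whose largest key is >= key.
size_t FindFile(const std::vector<ToyFile>& files, const std::string& key) {
  auto it = std::lower_bound(
      files.begin(), files.end(), key,
      [](const ToyFile& f, const std::string& k) { return f.largest < k; });
  return static_cast<size_t>(it - files.begin());
}

int main() {
  std::vector<ToyFile> level = {{"a", "c"}, {"f", "j"}, {"m", "p"}};
  assert(FindFile(level, "g") == 1);  // falls inside [f, j]
  assert(FindFile(level, "d") == 1);  // between files: candidate is [f, j]
  assert(FindFile(level, "q") == 3);  // after all files
  return 0;
}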
+class FilePicker { + public: + FilePicker(std::vector* files, const Slice& user_key, + const Slice& ikey, autovector* file_levels, + unsigned int num_levels, FileIndexer* file_indexer, + const Comparator* user_comparator, + const InternalKeyComparator* internal_comparator) + : num_levels_(num_levels), + curr_level_(static_cast(-1)), + returned_file_level_(static_cast(-1)), + hit_file_level_(static_cast(-1)), + search_left_bound_(0), + search_right_bound_(FileIndexer::kLevelMaxIndex), +#ifndef NDEBUG + files_(files), +#endif + level_files_brief_(file_levels), + is_hit_file_last_in_level_(false), + curr_file_level_(nullptr), + user_key_(user_key), + ikey_(ikey), + file_indexer_(file_indexer), + user_comparator_(user_comparator), + internal_comparator_(internal_comparator) { +#ifdef NDEBUG + (void)files; +#endif + // Setup member variables to search first level. + search_ended_ = !PrepareNextLevel(); + if (!search_ended_) { + // Prefetch Level 0 table data to avoid cache miss if possible. + for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { + auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; + if (r) { + r->Prepare(ikey); + } + } + } + } + + int GetCurrentLevel() const { return curr_level_; } + + FdWithKeyRange* GetNextFile() { + while (!search_ended_) { // Loops over different levels. + while (curr_index_in_curr_level_ < curr_file_level_->num_files) { + // Loops over all files in current level. + FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; + hit_file_level_ = curr_level_; + is_hit_file_last_in_level_ = + curr_index_in_curr_level_ == curr_file_level_->num_files - 1; + int cmp_largest = -1; + + // Do key range filtering of files or/and fractional cascading if: + // (1) not all the files are in level 0, or + // (2) there are more than 3 current level files + // If there are only 3 or less current level files in the system, we skip + // the key range filtering. In this case, more likely, the system is + // highly tuned to minimize number of tables queried by each query, + // so it is unlikely that key range filtering is more efficient than + // querying the files. + if (num_levels_ > 1 || curr_file_level_->num_files > 3) { + // Check if key is within a file's range. If search left bound and + // right bound point to the same find, we are sure key falls in + // range. + assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); + if (cmp_smallest >= 0) { + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); + } + + // Setup file search bound for the next level based on the + // comparison results + if (curr_level_ > 0) { + file_indexer_->GetNextLevelIndex(curr_level_, + curr_index_in_curr_level_, + cmp_smallest, cmp_largest, + &search_left_bound_, + &search_right_bound_); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + if (curr_level_ == 0) { + ++curr_index_in_curr_level_; + continue; + } else { + // Search next level. 
+ break; + } + } + } +#ifndef NDEBUG + // Sanity check to make sure that the files are correctly sorted + if (prev_file_) { + if (curr_level_ != 0) { + int comp_sign = internal_comparator_->Compare( + prev_file_->largest_key, f->smallest_key); + assert(comp_sign < 0); + } else { + // level == 0, the current file cannot be newer than the previous + // one. Use compressed data structure, has no attribute seqNo + assert(curr_index_in_curr_level_ > 0); + assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_], + files_[0][curr_index_in_curr_level_-1])); + } + } + prev_file_ = f; +#endif + returned_file_level_ = curr_level_; + if (curr_level_ > 0 && cmp_largest < 0) { + // No more files to search in this level. + search_ended_ = !PrepareNextLevel(); + } else { + ++curr_index_in_curr_level_; + } + return f; + } + // Start searching next level. + search_ended_ = !PrepareNextLevel(); + } + // Search ended. + return nullptr; + } + + // getter for current file level + // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts + unsigned int GetHitFileLevel() { return hit_file_level_; } + + // Returns true if the most recent "hit file" (i.e., one returned by + // GetNextFile()) is at the last index in its level. + bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } + + private: + unsigned int num_levels_; + unsigned int curr_level_; + unsigned int returned_file_level_; + unsigned int hit_file_level_; + int32_t search_left_bound_; + int32_t search_right_bound_; +#ifndef NDEBUG + std::vector* files_; +#endif + autovector* level_files_brief_; + bool search_ended_; + bool is_hit_file_last_in_level_; + LevelFilesBrief* curr_file_level_; + unsigned int curr_index_in_curr_level_; + unsigned int start_index_in_curr_level_; + Slice user_key_; + Slice ikey_; + FileIndexer* file_indexer_; + const Comparator* user_comparator_; + const InternalKeyComparator* internal_comparator_; +#ifndef NDEBUG + FdWithKeyRange* prev_file_; +#endif + + // Setup local variables to search next level. + // Returns false if there are no more levels to search. + bool PrepareNextLevel() { + curr_level_++; + while (curr_level_ < num_levels_) { + curr_file_level_ = &(*level_files_brief_)[curr_level_]; + if (curr_file_level_->num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. + assert(search_left_bound_ == 0); + assert(search_right_bound_ == -1 || + search_right_bound_ == FileIndexer::kLevelMaxIndex); + // Since current level is empty, it will need to search all files in + // the next level + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } + + // Some files may overlap each other. We find + // all files that overlap user_key and process them in order from + // newest to oldest. In the context of merge-operator, this can occur at + // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes + // are always compacted into a single entry). + int32_t start_index; + if (curr_level_ == 0) { + // On Level-0, we read through all files to check for overlap. + start_index = 0; + } else { + // On Level-n (n>=1), files are sorted. Binary search to find the + // earliest file whose largest key >= ikey. Search left bound and + // right bound are used to narrow the range. 
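The bounded search in the code that follows passes an exclusive limit of search_right_bound_ + 1, so that "the key lies past every candidate file" is distinguishable from "the key is in the last candidate file". A small self-contained sketch of that convention:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Bounded variant of the lower_bound search: `right` is an inclusive bound
// derived from the previous level, so we search [left, right + 1) and treat
// a result of right + 1 as "key is beyond this level's candidates".
size_t FindFileInRange(const std::vector<std::string>& largest_keys,
                       const std::string& key, size_t left, size_t right) {
  auto first = largest_keys.begin() + left;
  auto last = largest_keys.begin() + right + 1;  // limit one past the bound
  return static_cast<size_t>(std::lower_bound(first, last, key) -
                             largest_keys.begin());
}

int main() {
  std::vector<std::string> largest = {"c", "j", "p", "z"};
  // Bounds [1, 2] restrict the search to the files ending at "j" and "p".
  assert(FindFileInRange(largest, "k", 1, 2) == 2);  // inside the bounds
  assert(FindFileInRange(largest, "q", 1, 2) == 3);  // == right + 1: not here
  return 0;
}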
+ if (search_left_bound_ <= search_right_bound_) { + if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { + search_right_bound_ = + static_cast(curr_file_level_->num_files) - 1; + } + // `search_right_bound_` is an inclusive upper-bound, but since it was + // determined based on user key, it is still possible the lookup key + // falls to the right of `search_right_bound_`'s corresponding file. + // So, pass a limit one higher, which allows us to detect this case. + start_index = + FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, + static_cast(search_left_bound_), + static_cast(search_right_bound_) + 1); + if (start_index == search_right_bound_ + 1) { + // `ikey_` comes after `search_right_bound_`. The lookup key does + // not exist on this level, so let's skip this level and do a full + // binary search on the next level. + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } + } else { + // search_left_bound > search_right_bound, key does not exist in + // this level. Since no comparison is done in this level, it will + // need to search all files in the next level. + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } + } + start_index_in_curr_level_ = start_index; + curr_index_in_curr_level_ = start_index; +#ifndef NDEBUG + prev_file_ = nullptr; +#endif + return true; + } + // curr_level_ = num_levels_. So, no more levels to search. + return false; + } +}; + +class FilePickerMultiGet { + private: + struct FilePickerContext; + + public: + FilePickerMultiGet(MultiGetRange* range, + autovector* file_levels, + unsigned int num_levels, FileIndexer* file_indexer, + const Comparator* user_comparator, + const InternalKeyComparator* internal_comparator) + : num_levels_(num_levels), + curr_level_(static_cast(-1)), + returned_file_level_(static_cast(-1)), + hit_file_level_(static_cast(-1)), + range_(range), + batch_iter_(range->begin()), + batch_iter_prev_(range->begin()), + maybe_repeat_key_(false), + current_level_range_(*range, range->begin(), range->end()), + current_file_range_(*range, range->begin(), range->end()), + level_files_brief_(file_levels), + is_hit_file_last_in_level_(false), + curr_file_level_(nullptr), + file_indexer_(file_indexer), + user_comparator_(user_comparator), + internal_comparator_(internal_comparator) { + for (auto iter = range_->begin(); iter != range_->end(); ++iter) { + fp_ctx_array_[iter.index()] = + FilePickerContext(0, FileIndexer::kLevelMaxIndex); + } + + // Setup member variables to search first level. + search_ended_ = !PrepareNextLevel(); + if (!search_ended_) { + // REVISIT + // Prefetch Level 0 table data to avoid cache miss if possible. + // As of now, only PlainTableReader and CuckooTableReader do any + // prefetching. 
This may not be necessary anymore once we implement + // batching in those table readers + for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { + auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; + if (r) { + for (auto iter = range_->begin(); iter != range_->end(); ++iter) { + r->Prepare(iter->ikey); + } + } + } + } + } + + int GetCurrentLevel() const { return curr_level_; } + + // Iterates through files in the current level until it finds a file that + // contains atleast one key from the MultiGet batch + bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range, + size_t* file_index, FdWithKeyRange** fd, + bool* is_last_key_in_file) { + size_t curr_file_index = *file_index; + FdWithKeyRange* f = nullptr; + bool file_hit = false; + int cmp_largest = -1; + if (curr_file_index >= curr_file_level_->num_files) { + // In the unlikely case the next key is a duplicate of the current key, + // and the current key is the last in the level and the internal key + // was not found, we need to skip lookup for the remaining keys and + // reset the search bounds + if (batch_iter_ != current_level_range_.end()) { + ++batch_iter_; + for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + } + } + return false; + } + // Loops over keys in the MultiGet batch until it finds a file with + // atleast one of the keys. Then it keeps moving forward until the + // last key in the batch that falls in that file + while (batch_iter_ != current_level_range_.end() && + (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level == + curr_file_index || + !file_hit)) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level]; + Slice& user_key = batch_iter_->ukey; + + // Do key range filtering of files or/and fractional cascading if: + // (1) not all the files are in level 0, or + // (2) there are more than 3 current level files + // If there are only 3 or less current level files in the system, we + // skip the key range filtering. In this case, more likely, the system + // is highly tuned to minimize number of tables queried by each query, + // so it is unlikely that key range filtering is more efficient than + // querying the files. + if (num_levels_ > 1 || curr_file_level_->num_files > 3) { + // Check if key is within a file's range. If search left bound and + // right bound point to the same find, we are sure key falls in + // range. 
+ assert(curr_level_ == 0 || + fp_ctx.curr_index_in_curr_level == + fp_ctx.start_index_in_curr_level || + user_comparator_->Compare(user_key, + ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->Compare( + user_key, ExtractUserKey(f->smallest_key)); + if (cmp_smallest >= 0) { + cmp_largest = user_comparator_->Compare( + user_key, ExtractUserKey(f->largest_key)); + } else { + cmp_largest = -1; + } + + // Setup file search bound for the next level based on the + // comparison results + if (curr_level_ > 0) { + file_indexer_->GetNextLevelIndex( + curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest, + cmp_largest, &fp_ctx.search_left_bound, + &fp_ctx.search_right_bound); + } + // Key falls out of current file's range + if (cmp_smallest < 0 || cmp_largest > 0) { + next_file_range->SkipKey(batch_iter_); + } else { + file_hit = true; + } + } else { + file_hit = true; + } + if (cmp_largest == 0) { + // cmp_largest is 0, which means the next key will not be in this + // file, so stop looking further. Also don't increment megt_iter_ + // as we may have to look for this key in the next file if we don't + // find it in this one + break; + } else { + if (curr_level_ == 0) { + // We need to look through all files in level 0 + ++fp_ctx.curr_index_in_curr_level; + } + ++batch_iter_; + } + if (!file_hit) { + curr_file_index = + (batch_iter_ != current_level_range_.end()) + ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level + : curr_file_level_->num_files; + } + } + + *fd = f; + *file_index = curr_file_index; + *is_last_key_in_file = cmp_largest == 0; + return file_hit; + } + + FdWithKeyRange* GetNextFile() { + while (!search_ended_) { + // Start searching next level. + if (batch_iter_ == current_level_range_.end()) { + search_ended_ = !PrepareNextLevel(); + continue; + } else { + if (maybe_repeat_key_) { + maybe_repeat_key_ = false; + // Check if we found the final value for the last key in the + // previous lookup range. If we did, then there's no need to look + // any further for that key, so advance batch_iter_. Else, keep + // batch_iter_ positioned on that key so we look it up again in + // the next file + // For L0, always advance the key because we will look in the next + // file regardless for all keys not found yet + if (current_level_range_.CheckKeyDone(batch_iter_) || + curr_level_ == 0) { + ++batch_iter_; + } + } + // batch_iter_prev_ will become the start key for the next file + // lookup + batch_iter_prev_ = batch_iter_; + } + + MultiGetRange next_file_range(current_level_range_, batch_iter_prev_, + current_level_range_.end()); + size_t curr_file_index = + (batch_iter_ != current_level_range_.end()) + ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level + : curr_file_level_->num_files; + FdWithKeyRange* f; + bool is_last_key_in_file; + if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f, + &is_last_key_in_file)) { + search_ended_ = !PrepareNextLevel(); + } else { + MultiGetRange::Iterator upper_key = batch_iter_; + if (is_last_key_in_file) { + // Since cmp_largest is 0, batch_iter_ still points to the last key + // that falls in this file, instead of the next one. 
Increment + // upper_key so we can set the range properly for SST MultiGet + ++upper_key; + ++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level); + maybe_repeat_key_ = true; + } + // Set the range for this file + current_file_range_ = + MultiGetRange(next_file_range, batch_iter_prev_, upper_key); + returned_file_level_ = curr_level_; + hit_file_level_ = curr_level_; + is_hit_file_last_in_level_ = + curr_file_index == curr_file_level_->num_files - 1; + return f; + } + } + + // Search ended + return nullptr; + } + + // getter for current file level + // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts + unsigned int GetHitFileLevel() { return hit_file_level_; } + + // Returns true if the most recent "hit file" (i.e., one returned by + // GetNextFile()) is at the last index in its level. + bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } + + const MultiGetRange& CurrentFileRange() { return current_file_range_; } + + private: + unsigned int num_levels_; + unsigned int curr_level_; + unsigned int returned_file_level_; + unsigned int hit_file_level_; + + struct FilePickerContext { + int32_t search_left_bound; + int32_t search_right_bound; + unsigned int curr_index_in_curr_level; + unsigned int start_index_in_curr_level; + + FilePickerContext(int32_t left, int32_t right) + : search_left_bound(left), search_right_bound(right), + curr_index_in_curr_level(0), start_index_in_curr_level(0) {} + + FilePickerContext() = default; + }; + std::array fp_ctx_array_; + MultiGetRange* range_; + // Iterator to iterate through the keys in a MultiGet batch, that gets reset + // at the beginning of each level. Each call to GetNextFile() will position + // batch_iter_ at or right after the last key that was found in the returned + // SST file + MultiGetRange::Iterator batch_iter_; + // An iterator that records the previous position of batch_iter_, i.e last + // key found in the previous SST file, in order to serve as the start of + // the batch key range for the next SST file + MultiGetRange::Iterator batch_iter_prev_; + bool maybe_repeat_key_; + MultiGetRange current_level_range_; + MultiGetRange current_file_range_; + autovector* level_files_brief_; + bool search_ended_; + bool is_hit_file_last_in_level_; + LevelFilesBrief* curr_file_level_; + FileIndexer* file_indexer_; + const Comparator* user_comparator_; + const InternalKeyComparator* internal_comparator_; + + // Setup local variables to search next level. + // Returns false if there are no more levels to search. + bool PrepareNextLevel() { + if (curr_level_ == 0) { + MultiGetRange::Iterator mget_iter = current_level_range_.begin(); + if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < + curr_file_level_->num_files) { + batch_iter_prev_ = current_level_range_.begin(); + batch_iter_ = current_level_range_.begin(); + return true; + } + } + + curr_level_++; + // Reset key range to saved value + while (curr_level_ < num_levels_) { + bool level_contains_keys = false; + curr_file_level_ = &(*level_files_brief_)[curr_level_]; + if (curr_file_level_->num_files == 0) { + // When current level is empty, the search bound generated from upper + // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is + // also empty. 
+
+        for (auto mget_iter = current_level_range_.begin();
+             mget_iter != current_level_range_.end(); ++mget_iter) {
+          struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+
+          assert(fp_ctx.search_left_bound == 0);
+          assert(fp_ctx.search_right_bound == -1 ||
+                 fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
+          // Since current level is empty, it will need to search all files in
+          // the next level
+          fp_ctx.search_left_bound = 0;
+          fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+        }
+        // Skip all subsequent empty levels
+        do {
+          ++curr_level_;
+        } while ((curr_level_ < num_levels_) &&
+                 (*level_files_brief_)[curr_level_].num_files == 0);
+        continue;
+      }
+
+      // Some files may overlap each other. We find
+      // all files that overlap user_key and process them in order from
+      // newest to oldest. In the context of merge-operator, this can occur at
+      // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+      // are always compacted into a single entry).
+      int32_t start_index = -1;
+      current_level_range_ =
+          MultiGetRange(*range_, range_->begin(), range_->end());
+      for (auto mget_iter = current_level_range_.begin();
+           mget_iter != current_level_range_.end(); ++mget_iter) {
+        struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+        if (curr_level_ == 0) {
+          // On Level-0, we read through all files to check for overlap.
+          start_index = 0;
+          level_contains_keys = true;
+        } else {
+          // On Level-n (n>=1), files are sorted. Binary search to find the
+          // earliest file whose largest key >= ikey. Search left bound and
+          // right bound are used to narrow the range.
+          if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
+            if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
+              fp_ctx.search_right_bound =
+                  static_cast<int32_t>(curr_file_level_->num_files) - 1;
+            }
+            // `search_right_bound_` is an inclusive upper-bound, but since it
+            // was determined based on user key, it is still possible the lookup
+            // key falls to the right of `search_right_bound_`'s corresponding
+            // file. So, pass a limit one higher, which allows us to detect this
+            // case.
+            Slice& ikey = mget_iter->ikey;
+            start_index = FindFileInRange(
+                *internal_comparator_, *curr_file_level_, ikey,
+                static_cast<uint32_t>(fp_ctx.search_left_bound),
+                static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
+            if (start_index == fp_ctx.search_right_bound + 1) {
+              // `ikey_` comes after `search_right_bound_`. The lookup key does
+              // not exist on this level, so let's skip this level and do a full
+              // binary search on the next level.
+              fp_ctx.search_left_bound = 0;
+              fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+              current_level_range_.SkipKey(mget_iter);
+              continue;
+            } else {
+              level_contains_keys = true;
+            }
+          } else {
+            // search_left_bound > search_right_bound, key does not exist in
+            // this level. Since no comparison is done in this level, it will
+            // need to search all files in the next level.
+            fp_ctx.search_left_bound = 0;
+            fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+            current_level_range_.SkipKey(mget_iter);
+            continue;
+          }
+        }
+        fp_ctx.start_index_in_curr_level = start_index;
+        fp_ctx.curr_index_in_curr_level = start_index;
+      }
+      if (level_contains_keys) {
+        batch_iter_prev_ = current_level_range_.begin();
+        batch_iter_ = current_level_range_.begin();
+        return true;
+      }
+      curr_level_++;
+    }
+    // curr_level_ = num_levels_. So, no more levels to search.
+    return false;
+  }
+};
+}  // anonymous namespace
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  prev_->next_ = next_;
+  next_->prev_ = prev_;
+
+  // Drop references to files
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+      FileMetaData* f = storage_info_.files_[level][i];
+      assert(f->refs > 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        assert(cfd_ != nullptr);
+        uint32_t path_id = f->fd.GetPathId();
+        assert(path_id < cfd_->ioptions()->cf_paths.size());
+        vset_->obsolete_files_.push_back(
+            ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path));
+      }
+    }
+  }
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+             const LevelFilesBrief& file_level,
+             const Slice& key) {
+  return FindFileInRange(icmp, file_level, key, 0,
+                         static_cast<uint32_t>(file_level.num_files));
+}
+
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+                               const std::vector<FileMetaData*>& files,
+                               Arena* arena) {
+  assert(file_level);
+  assert(arena);
+
+  size_t num = files.size();
+  file_level->num_files = num;
+  char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+  file_level->files = new (mem) FdWithKeyRange[num];
+
+  for (size_t i = 0; i < num; i++) {
+    Slice smallest_key = files[i]->smallest.Encode();
+    Slice largest_key = files[i]->largest.Encode();
+
+    // Copy key slice to sequential memory
+    size_t smallest_size = smallest_key.size();
+    size_t largest_size = largest_key.size();
+    mem = arena->AllocateAligned(smallest_size + largest_size);
+    memcpy(mem, smallest_key.data(), smallest_size);
+    memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+    FdWithKeyRange& f = file_level->files[i];
+    f.fd = files[i]->fd;
+    f.file_metadata = files[i];
+    f.smallest_key = Slice(mem, smallest_size);
+    f.largest_key = Slice(mem + smallest_size, largest_size);
+  }
+}
+
+static bool AfterFile(const Comparator* ucmp,
+                      const Slice* user_key, const FdWithKeyRange* f) {
+  // nullptr user_key occurs before all keys and is therefore never after *f
+  return (user_key != nullptr &&
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->largest_key)) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp,
+                       const Slice* user_key, const FdWithKeyRange* f) {
+  // nullptr user_key occurs after all keys and is therefore never before *f
+  return (user_key != nullptr &&
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->smallest_key)) < 0);
+}
+
+bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const LevelFilesBrief& file_level,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+  if (!disjoint_sorted_files) {
+    // Need to check against all files
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      const FdWithKeyRange* f = &(file_level.files[i]);
+      if (AfterFile(ucmp, smallest_user_key, f) ||
+          BeforeFile(ucmp, largest_user_key, f)) {
+        // No overlap
+      } else {
+        return true;  // Overlap
+      }
+    }
+    return false;
+  }
+
+  // Binary search over file list
+  uint32_t index = 0;
+  if (smallest_user_key != nullptr) {
+    // Find the leftmost possible internal key for smallest_user_key
+    InternalKey small;
+    small.SetMinPossibleForUserKey(*smallest_user_key);
+    index = FindFile(icmp, file_level, small.Encode());
+  }
+
+  if (index >= file_level.num_files) {
+    // beginning of range is after all files, so no overlap.
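+    // (i.e. even the smallest possible internal key for smallest_user_key
+    // sorts past the largest key of every file in this level)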
+    return false;
+  }
+
+  return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
+
+namespace {
+
+class LevelIterator final : public InternalIterator {
+ public:
+  LevelIterator(TableCache* table_cache, const ReadOptions& read_options,
+                const FileOptions& file_options,
+                const InternalKeyComparator& icomparator,
+                const LevelFilesBrief* flevel,
+                const SliceTransform* prefix_extractor, bool should_sample,
+                HistogramImpl* file_read_hist, TableReaderCaller caller,
+                bool skip_filters, int level, RangeDelAggregator* range_del_agg,
+                const std::vector<AtomicCompactionUnitBoundary>*
+                    compaction_boundaries = nullptr)
+      : table_cache_(table_cache),
+        read_options_(read_options),
+        file_options_(file_options),
+        icomparator_(icomparator),
+        user_comparator_(icomparator.user_comparator()),
+        flevel_(flevel),
+        prefix_extractor_(prefix_extractor),
+        file_read_hist_(file_read_hist),
+        should_sample_(should_sample),
+        caller_(caller),
+        skip_filters_(skip_filters),
+        file_index_(flevel_->num_files),
+        level_(level),
+        range_del_agg_(range_del_agg),
+        pinned_iters_mgr_(nullptr),
+        compaction_boundaries_(compaction_boundaries) {
+    // Empty level is not supported.
+    assert(flevel_ != nullptr && flevel_->num_files > 0);
+  }
+
+  ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  bool NextAndGetResult(IterateResult* result) override;
+  void Prev() override;
+
+  bool Valid() const override { return file_iter_.Valid(); }
+  Slice key() const override {
+    assert(Valid());
+    return file_iter_.key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    return file_iter_.value();
+  }
+
+  Status status() const override {
+    return file_iter_.iter() ? file_iter_.status() : Status::OK();
+  }
+
+  inline bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+    return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
+  }
+
+  inline bool MayBeOutOfUpperBound() override {
+    assert(Valid());
+    return file_iter_.MayBeOutOfUpperBound();
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    if (file_iter_.iter()) {
+      file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
+    }
+  }
+
+  bool IsKeyPinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_.iter() && file_iter_.IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_.iter() && file_iter_.IsValuePinned();
+  }
+
+ private:
+  // Return true if at least one invalid file is seen and skipped.
+  bool SkipEmptyFileForward();
+  void SkipEmptyFileBackward();
+  void SetFileIterator(InternalIterator* iter);
+  void InitFileIterator(size_t new_file_index);
+
+  // Called by both of Next() and NextAndGetResult(). Force inline.
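+  // (Keeping the shared body in a plain helper lets the compiler inline it
+  // into both call sites on the hot iteration path.)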
+  void NextImpl() {
+    assert(Valid());
+    file_iter_.Next();
+    SkipEmptyFileForward();
+  }
+
+  const Slice& file_smallest_key(size_t file_index) {
+    assert(file_index < flevel_->num_files);
+    return flevel_->files[file_index].smallest_key;
+  }
+
+  bool KeyReachedUpperBound(const Slice& internal_key) {
+    return read_options_.iterate_upper_bound != nullptr &&
+           user_comparator_.CompareWithoutTimestamp(
+               ExtractUserKey(internal_key),
+               *read_options_.iterate_upper_bound) >= 0;
+  }
+
+  InternalIterator* NewFileIterator() {
+    assert(file_index_ < flevel_->num_files);
+    auto file_meta = flevel_->files[file_index_];
+    if (should_sample_) {
+      sample_file_read_inc(file_meta.file_metadata);
+    }
+
+    const InternalKey* smallest_compaction_key = nullptr;
+    const InternalKey* largest_compaction_key = nullptr;
+    if (compaction_boundaries_ != nullptr) {
+      smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
+      largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+    }
+    CheckMayBeOutOfLowerBound();
+    return table_cache_->NewIterator(
+        read_options_, file_options_, icomparator_, *file_meta.file_metadata,
+        range_del_agg_, prefix_extractor_,
+        nullptr /* don't need reference to table */, file_read_hist_, caller_,
+        /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key,
+        largest_compaction_key);
+  }
+
+  // Check if the current file is fully within iterate_lower_bound.
+  //
+  // Note MyRocks may update iterate bounds between seeks. To work around
+  // that, we need to check and update may_be_out_of_lower_bound_ accordingly.
+  void CheckMayBeOutOfLowerBound() {
+    if (read_options_.iterate_lower_bound != nullptr &&
+        file_index_ < flevel_->num_files) {
+      may_be_out_of_lower_bound_ =
+          user_comparator_.Compare(
+              ExtractUserKey(file_smallest_key(file_index_)),
+              *read_options_.iterate_lower_bound) < 0;
+    }
+  }
+
+  TableCache* table_cache_;
+  const ReadOptions read_options_;
+  const FileOptions& file_options_;
+  const InternalKeyComparator& icomparator_;
+  const UserComparatorWrapper user_comparator_;
+  const LevelFilesBrief* flevel_;
+  mutable FileDescriptor current_value_;
+  // `prefix_extractor_` may be non-null even for total order seek. Checking
+  // this variable is not the right way to identify whether prefix iterator
+  // is used.
+  const SliceTransform* prefix_extractor_;
+
+  HistogramImpl* file_read_hist_;
+  bool should_sample_;
+  TableReaderCaller caller_;
+  bool skip_filters_;
+  bool may_be_out_of_lower_bound_ = true;
+  size_t file_index_;
+  int level_;
+  RangeDelAggregator* range_del_agg_;
+  IteratorWrapper file_iter_;  // May be nullptr
+  PinnedIteratorsManager* pinned_iters_mgr_;
+
+  // To be propagated to RangeDelAggregator in order to safely truncate range
+  // tombstones.
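+  // (A compaction may own only a sub-range of a file's keys; clamping range
+  // tombstones to these per-file boundaries keeps them from being applied
+  // beyond the keys the compaction actually covers.)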
+  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+};
+
+void LevelIterator::Seek(const Slice& target) {
+  // Check whether the seek key falls within the current file
+  bool need_to_reseek = true;
+  if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
+    const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+    if (icomparator_.InternalKeyComparator::Compare(
+            target, cur_file.largest_key) <= 0 &&
+        icomparator_.InternalKeyComparator::Compare(
+            target, cur_file.smallest_key) >= 0) {
+      need_to_reseek = false;
+      assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
+             file_index_);
+    }
+  }
+  if (need_to_reseek) {
+    TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
+    size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+    InitFileIterator(new_file_index);
+  }
+
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.Seek(target);
+  }
+  if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
+      !read_options_.total_order_seek && !read_options_.auto_prefix_mode &&
+      file_iter_.iter() != nullptr && file_iter_.Valid()) {
+    // We've skipped the file we initially positioned to. In the prefix
+    // seek case, it is likely that the file is skipped because of
+    // prefix bloom or hash, where more keys are skipped. We then check
+    // the current key and invalidate the iterator if the prefix is
+    // already passed.
+    // When doing a prefix iterator seek, once the keys for a prefix have
+    // been exhausted, the iterator can jump to any key that is larger. Here
+    // we enforce a stricter contract than that, in order to make it easier
+    // for higher layers (merging and DB iterator) to reason about
+    // correctness:
+    // 1. Within the prefix, the result should be accurate.
+    // 2. If keys for the prefix are exhausted, the iterator is either
+    //    positioned to the next key after the prefix, or made invalid.
+    // A side benefit is that the iterator is invalidated earlier, so that
+    // the upper level merging iterator can merge fewer child iterators.
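+    // Example: with a fixed-length prefix extractor, a Seek() to "abc1" that
+    // ends up positioned on "abd0" after file skipping has already left the
+    // "abc" prefix, so the iterator is invalidated below rather than
+    // surfacing keys from an unrelated prefix.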
+    Slice target_user_key = ExtractUserKey(target);
+    Slice file_user_key = ExtractUserKey(file_iter_.key());
+    if (prefix_extractor_->InDomain(target_user_key) &&
+        (!prefix_extractor_->InDomain(file_user_key) ||
+         user_comparator_.Compare(
+             prefix_extractor_->Transform(target_user_key),
+             prefix_extractor_->Transform(file_user_key)) != 0)) {
+      SetFileIterator(nullptr);
+    }
+  }
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekForPrev(const Slice& target) {
+  size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+  if (new_file_index >= flevel_->num_files) {
+    new_file_index = flevel_->num_files - 1;
+  }
+
+  InitFileIterator(new_file_index);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekForPrev(target);
+    SkipEmptyFileBackward();
+  }
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToFirst() {
+  InitFileIterator(0);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekToFirst();
+  }
+  SkipEmptyFileForward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToLast() {
+  InitFileIterator(flevel_->num_files - 1);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekToLast();
+  }
+  SkipEmptyFileBackward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::Next() { NextImpl(); }
+
+bool LevelIterator::NextAndGetResult(IterateResult* result) {
+  NextImpl();
+  bool is_valid = Valid();
+  if (is_valid) {
+    result->key = key();
+    result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
+  }
+  return is_valid;
+}
+
+void LevelIterator::Prev() {
+  assert(Valid());
+  file_iter_.Prev();
+  SkipEmptyFileBackward();
+}
+
+bool LevelIterator::SkipEmptyFileForward() {
+  bool seen_empty_file = false;
+  while (file_iter_.iter() == nullptr ||
+         (!file_iter_.Valid() && file_iter_.status().ok() &&
+          !file_iter_.iter()->IsOutOfBound())) {
+    seen_empty_file = true;
+    // Move to next file
+    if (file_index_ >= flevel_->num_files - 1) {
+      // Already at the last file
+      SetFileIterator(nullptr);
+      break;
+    }
+    if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) {
+      SetFileIterator(nullptr);
+      break;
+    }
+    InitFileIterator(file_index_ + 1);
+    if (file_iter_.iter() != nullptr) {
+      file_iter_.SeekToFirst();
+    }
+  }
+  return seen_empty_file;
+}
+
+void LevelIterator::SkipEmptyFileBackward() {
+  while (file_iter_.iter() == nullptr ||
+         (!file_iter_.Valid() && file_iter_.status().ok())) {
+    // Move to previous file
+    if (file_index_ == 0) {
+      // Already the first file
+      SetFileIterator(nullptr);
+      return;
+    }
+    InitFileIterator(file_index_ - 1);
+    if (file_iter_.iter() != nullptr) {
+      file_iter_.SeekToLast();
+    }
+  }
+}
+
+void LevelIterator::SetFileIterator(InternalIterator* iter) {
+  if (pinned_iters_mgr_ && iter) {
+    iter->SetPinnedItersMgr(pinned_iters_mgr_);
+  }
+
+  InternalIterator* old_iter = file_iter_.Set(iter);
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    pinned_iters_mgr_->PinIterator(old_iter);
+  } else {
+    delete old_iter;
+  }
+}
+
+void LevelIterator::InitFileIterator(size_t new_file_index) {
+  if (new_file_index >= flevel_->num_files) {
+    file_index_ = new_file_index;
+    SetFileIterator(nullptr);
+    return;
+  } else {
+    // If the file iterator returned an Incomplete status, we try again when
+    // users seek to the same file, as this time we may go to a different
+    // data block which is cached in the block cache.
+    if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
+        new_file_index == file_index_) {
+      // file_iter_ is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      file_index_ = new_file_index;
+      InternalIterator* iter = NewFileIterator();
+      SetFileIterator(iter);
+    }
+  }
+}
+}  // anonymous namespace
+
+// A wrapper of version builder which references the current version in
+// the constructor and unrefs it in the destructor.
+// Both the constructor and the destructor need to be called inside the DB
+// mutex.
+class BaseReferencedVersionBuilder {
+ public:
+  explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd)
+      : version_builder_(new VersionBuilder(
+            cfd->current()->version_set()->file_options(), cfd->table_cache(),
+            cfd->current()->storage_info(), cfd->ioptions()->info_log)),
+        version_(cfd->current()) {
+    version_->Ref();
+  }
+  ~BaseReferencedVersionBuilder() {
+    version_->Unref();
+  }
+  VersionBuilder* version_builder() { return version_builder_.get(); }
+
+ private:
+  std::unique_ptr<VersionBuilder> version_builder_;
+  Version* version_;
+};
+
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                                   const FileMetaData* file_meta,
+                                   const std::string* fname) const {
+  auto table_cache = cfd_->table_cache();
+  auto ioptions = cfd_->ioptions();
+  Status s = table_cache->GetTableProperties(
+      file_options_, cfd_->internal_comparator(), file_meta->fd, tp,
+      mutable_cf_options_.prefix_extractor.get(), true /* no io */);
+  if (s.ok()) {
+    return s;
+  }
+
+  // We only ignore error type `Incomplete` since it's by design that we
+  // disallow reading the table when it's not in the table cache.
+  if (!s.IsIncomplete()) {
+    return s;
+  }
+
+  // 2. Table is not present in table cache, we'll read the table properties
+  // directly from the properties block in the file.
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::string file_name;
+  if (fname != nullptr) {
+    file_name = *fname;
+  } else {
+    file_name =
+        TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+                      file_meta->fd.GetPathId());
+  }
+  s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+                                        nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  TableProperties* raw_table_properties;
+  // By setting the magic number to kInvalidTableMagicNumber, we can bypass
+  // the magic number check in the footer.
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_name, nullptr /* env */, nullptr /* stats */,
+          0 /* hist_type */, nullptr /* file_read_hist */,
+          nullptr /* rate_limiter */, ioptions->listeners));
+  s = ReadTableProperties(
+      file_reader.get(), file_meta->fd.GetFileSize(),
+      Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions,
+      &raw_table_properties, false /* compression_type_missing */);
+  if (!s.ok()) {
+    return s;
+  }
+  RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+
+  *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
+  return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+  Status s;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    s = GetPropertiesOfAllTables(props, level);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+                                            std::string* out_str) {
+  if (max_entries_to_print <= 0) {
+    return Status::OK();
+  }
+  int num_entries_left = max_entries_to_print;
+
+  std::stringstream ss;
+
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.files_[level]) {
+      auto fname =
+          TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                        file_meta->fd.GetPathId());
+
+      ss << "=== file : " << fname << " ===\n";
+
+      TableCache* table_cache = cfd_->table_cache();
+      std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+      Status s = table_cache->GetRangeTombstoneIterator(
+          ReadOptions(), cfd_->internal_comparator(), *file_meta,
+          &tombstone_iter);
+      if (!s.ok()) {
+        return s;
+      }
+      if (tombstone_iter) {
+        tombstone_iter->SeekToFirst();
+
+        while (tombstone_iter->Valid() && num_entries_left > 0) {
+          ss << "start: " << tombstone_iter->start_key().ToString(true)
+             << " end: " << tombstone_iter->end_key().ToString(true)
+             << " seq: " << tombstone_iter->seq() << '\n';
+          tombstone_iter->Next();
+          num_entries_left--;
+        }
+        if (num_entries_left <= 0) {
+          break;
+        }
+      }
+    }
+    if (num_entries_left <= 0) {
+      break;
+    }
+  }
+  assert(num_entries_left >= 0);
+  if (num_entries_left <= 0) {
+    ss << "(results may not be complete)\n";
+  }
+
+  *out_str = ss.str();
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+                                         int level) {
+  for (const auto& file_meta : storage_info_.files_[level]) {
+    auto fname =
+        TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                      file_meta->fd.GetPathId());
+    // 1. If the table is already present in table cache, load table
+    // properties from there.
+    std::shared_ptr<const TableProperties> table_properties;
+    Status s = GetTableProperties(&table_properties, file_meta, &fname);
+    if (s.ok()) {
+      props->insert({fname, table_properties});
+    } else {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+    const Range* range, std::size_t n, TablePropertiesCollection* props) const {
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    for (decltype(n) i = 0; i < n; i++) {
+      // Convert user_key into a corresponding internal key.
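+      // (kMaxSequenceNumber with kValueTypeForSeek makes each probe key sort
+      // before all stored entries for the same user key.)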
+      InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+      InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+      std::vector<FileMetaData*> files;
+      storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+                                         false);
+      for (const auto& file_meta : files) {
+        auto fname =
+            TableFileName(cfd_->ioptions()->cf_paths,
+                          file_meta->fd.GetNumber(), file_meta->fd.GetPathId());
+        if (props->count(fname) == 0) {
+          // 1. If the table is already present in table cache, load table
+          // properties from there.
+          std::shared_ptr<const TableProperties> table_properties;
+          Status s = GetTableProperties(&table_properties, file_meta, &fname);
+          if (s.ok()) {
+            props->insert({fname, table_properties});
+          } else {
+            return s;
+          }
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetAggregatedTableProperties(
+    std::shared_ptr<const TableProperties>* tp, int level) {
+  TablePropertiesCollection props;
+  Status s;
+  if (level < 0) {
+    s = GetPropertiesOfAllTables(&props);
+  } else {
+    s = GetPropertiesOfAllTables(&props, level);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto* new_tp = new TableProperties();
+  for (const auto& item : props) {
+    new_tp->Add(*item.second);
+  }
+  tp->reset(new_tp);
+  return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders() {
+  size_t total_usage = 0;
+  for (auto& file_level : storage_info_.level_files_brief_) {
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+          file_options_, cfd_->internal_comparator(), file_level.files[i].fd,
+          mutable_cf_options_.prefix_extractor.get());
+    }
+  }
+  return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+  assert(cf_meta);
+  assert(cfd_);
+
+  cf_meta->name = cfd_->GetName();
+  cf_meta->size = 0;
+  cf_meta->file_count = 0;
+  cf_meta->levels.clear();
+
+  auto* ioptions = cfd_->ioptions();
+  auto* vstorage = storage_info();
+
+  for (int level = 0; level < cfd_->NumberLevels(); level++) {
+    uint64_t level_size = 0;
+    cf_meta->file_count += vstorage->LevelFiles(level).size();
+    std::vector<SstFileMetaData> files;
+    for (const auto& file : vstorage->LevelFiles(level)) {
+      uint32_t path_id = file->fd.GetPathId();
+      std::string file_path;
+      if (path_id < ioptions->cf_paths.size()) {
+        file_path = ioptions->cf_paths[path_id].path;
+      } else {
+        assert(!ioptions->cf_paths.empty());
+        file_path = ioptions->cf_paths.back().path;
+      }
+      const uint64_t file_number = file->fd.GetNumber();
+      files.emplace_back(SstFileMetaData{
+          MakeTableFileName("", file_number), file_number, file_path,
+          static_cast<size_t>(file->fd.GetFileSize()), file->fd.smallest_seqno,
+          file->fd.largest_seqno, file->smallest.user_key().ToString(),
+          file->largest.user_key().ToString(),
+          file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+          file->being_compacted, file->oldest_blob_file_number,
+          file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(),
+          file->file_checksum, file->file_checksum_func_name});
+      files.back().num_entries = file->num_entries;
+      files.back().num_deletions = file->num_deletions;
+      level_size += file->fd.GetFileSize();
+    }
+    cf_meta->levels.emplace_back(
+        level, level_size, std::move(files));
+    cf_meta->size += level_size;
+  }
+}
+
+uint64_t Version::GetSstFilesSize() {
+  uint64_t sst_files_size = 0;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+      sst_files_size += file_meta->fd.GetFileSize();
+    }
+  }
+  return sst_files_size;
+}
+
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+  uint64_t oldest_time = port::kMaxUint64;
+  for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+    for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+      assert(meta->fd.table_reader != nullptr);
+      uint64_t file_creation_time = meta->TryGetFileCreationTime();
+      if (file_creation_time == kUnknownFileCreationTime) {
+        *creation_time = 0;
+        return;
+      }
+      if (file_creation_time < oldest_time) {
+        oldest_time = file_creation_time;
+      }
+    }
+  }
+  *creation_time = oldest_time;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+  // Estimation will be inaccurate when:
+  // (1) there exist merge keys
+  // (2) keys are directly overwritten
+  // (3) deletion on non-existing keys
+  // (4) low number of samples
+  if (current_num_samples_ == 0) {
+    return 0;
+  }
+
+  if (current_num_non_deletions_ <= current_num_deletions_) {
+    return 0;
+  }
+
+  uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+  uint64_t file_count = 0;
+  for (int level = 0; level < num_levels_; ++level) {
+    file_count += files_[level].size();
+  }
+
+  if (current_num_samples_ < file_count) {
+    // casting to avoid overflowing
+    return
+      static_cast<uint64_t>(
+        (est * static_cast<double>(file_count) / current_num_samples_)
+      );
+  } else {
+    return est;
+  }
+}
+
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+    int level) const {
+  assert(level < num_levels_);
+  uint64_t sum_file_size_bytes = 0;
+  uint64_t sum_data_size_bytes = 0;
+  for (auto* file_meta : files_[level]) {
+    sum_file_size_bytes += file_meta->fd.GetFileSize();
+    sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+  }
+  if (sum_file_size_bytes == 0) {
+    return -1.0;
+  }
+  return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+                           const FileOptions& soptions,
+                           MergeIteratorBuilder* merge_iter_builder,
+                           RangeDelAggregator* range_del_agg) {
+  assert(storage_info_.finalized_);
+
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+                         range_del_agg);
+  }
+}
+
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+                                   const FileOptions& soptions,
+                                   MergeIteratorBuilder* merge_iter_builder,
+                                   int level,
+                                   RangeDelAggregator* range_del_agg) {
+  assert(storage_info_.finalized_);
+  if (level >= storage_info_.num_non_empty_levels()) {
+    // This is an empty level
+    return;
+  } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+    // No files in this level
+    return;
+  }
+
+  bool should_sample = should_sample_file_read();
+
+  auto* arena = merge_iter_builder->GetArena();
+  if (level == 0) {
+    // Merge all level zero files together since they may overlap
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+      merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
+          read_options, soptions, cfd_->internal_comparator(),
+          *file.file_metadata, range_del_agg,
+          mutable_cf_options_.prefix_extractor.get(), nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, arena,
+          /*skip_filters=*/false, /*level=*/0,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr));
+    }
+    if (should_sample) {
+      // Count one read sample for every L0 file. This is done per iterator
+      // creation rather than per Seek(), while files in other levels are
+      // recorded per seek. If users execute one range query per iterator,
+      // there may be some discrepancy here.
+      for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+        sample_file_read_inc(meta);
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    // For levels > 0, we can use a concatenating iterator that sequentially
+    // walks through the non-overlapping files in the level, opening them
+    // lazily.
+    auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+    merge_iter_builder->AddIterator(new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, soptions,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        range_del_agg, /*compaction_boundaries=*/nullptr));
+  }
+}
+
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+                                         const FileOptions& file_options,
+                                         const Slice& smallest_user_key,
+                                         const Slice& largest_user_key,
+                                         int level, bool* overlap) {
+  assert(storage_info_.finalized_);
+
+  auto icmp = cfd_->internal_comparator();
+  auto ucmp = icmp.user_comparator();
+
+  Arena arena;
+  Status status;
+  ReadRangeDelAggregator range_del_agg(&icmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+
+  *overlap = false;
+
+  if (level == 0) {
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+      if (AfterFile(ucmp, &smallest_user_key, file) ||
+          BeforeFile(ucmp, &largest_user_key, file)) {
+        continue;
+      }
+      ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+          read_options, file_options, cfd_->internal_comparator(),
+          *file->file_metadata, &range_del_agg,
+          mutable_cf_options_.prefix_extractor.get(), nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, &arena,
+          /*skip_filters=*/false, /*level=*/0,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr));
+      status = OverlapWithIterator(
+          ucmp, smallest_user_key, largest_user_key, iter.get(), overlap);
+      if (!status.ok() || *overlap) {
+        break;
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+    ScopedArenaIterator iter(new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, file_options,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        &range_del_agg));
+    status = OverlapWithIterator(
+        ucmp, smallest_user_key, largest_user_key, iter.get(), overlap);
+  }
+
+  if (status.ok() && *overlap == false &&
+      range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+    *overlap = true;
+  }
+  return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+    const InternalKeyComparator* internal_comparator,
+    const Comparator* user_comparator, int levels,
+    CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+    bool _force_consistency_checks)
+    : internal_comparator_(internal_comparator),
+      user_comparator_(user_comparator),
+      // cfd is nullptr if Version is dummy
+      num_levels_(levels),
+      num_non_empty_levels_(0),
+      file_indexer_(user_comparator),
+      compaction_style_(compaction_style),
+      files_(new std::vector<FileMetaData*>[num_levels_]),
+      base_level_(num_levels_ == 1 ? -1 : 1),
+      level_multiplier_(0.0),
+      files_by_compaction_pri_(num_levels_),
+      level0_non_overlapping_(false),
+      next_file_to_compact_by_size_(num_levels_),
+      compaction_score_(num_levels_),
+      compaction_level_(num_levels_),
+      l0_delay_trigger_count_(0),
+      accumulated_file_size_(0),
+      accumulated_raw_key_size_(0),
+      accumulated_raw_value_size_(0),
+      accumulated_num_non_deletions_(0),
+      accumulated_num_deletions_(0),
+      current_num_non_deletions_(0),
+      current_num_deletions_(0),
+      current_num_samples_(0),
+      estimated_compaction_needed_bytes_(0),
+      finalized_(false),
+      force_consistency_checks_(_force_consistency_checks) {
+  if (ref_vstorage != nullptr) {
+    accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+    accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+    accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+    accumulated_num_non_deletions_ =
+        ref_vstorage->accumulated_num_non_deletions_;
+    accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+    current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
+    current_num_deletions_ = ref_vstorage->current_num_deletions_;
+    current_num_samples_ = ref_vstorage->current_num_samples_;
+    oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
+  }
+}
+
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+                 const FileOptions& file_opt,
+                 const MutableCFOptions mutable_cf_options,
+                 uint64_t version_number)
+    : env_(vset->env_),
+      cfd_(column_family_data),
+      info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log),
+      db_statistics_((cfd_ == nullptr) ? nullptr
+                                       : cfd_->ioptions()->statistics),
+      table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+      merge_operator_((cfd_ == nullptr) ? nullptr
+                                        : cfd_->ioptions()->merge_operator),
+      storage_info_(
+          (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+          (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+          cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+          cfd_ == nullptr ? kCompactionStyleLevel
+                          : cfd_->ioptions()->compaction_style,
+          (cfd_ == nullptr || cfd_->current() == nullptr)
+              ? nullptr
+              : cfd_->current()->storage_info(),
+          cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
+      vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      file_options_(file_opt),
+      mutable_cf_options_(mutable_cf_options),
+      version_number_(version_number) {}
+
+void Version::Get(const ReadOptions& read_options, const LookupKey& k,
+                  PinnableSlice* value, Status* status,
+                  MergeContext* merge_context,
+                  SequenceNumber* max_covering_tombstone_seq, bool* value_found,
+                  bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
+                  bool* is_blob, bool do_merge) {
+  Slice ikey = k.internal_key();
+  Slice user_key = k.user_key();
+
+  assert(status->ok() || status->IsMergeInProgress());
+
+  if (key_exists != nullptr) {
+    // will falsify below if not found
+    *key_exists = true;
+  }
+
+  PinnedIteratorsManager pinned_iters_mgr;
+  uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
+  if (vset_ && vset_->block_cache_tracer_ &&
+      vset_->block_cache_tracer_->is_tracing_enabled()) {
+    tracing_get_id = vset_->block_cache_tracer_->NextGetId();
+  }
+  GetContext get_context(
+      user_comparator(), merge_operator_, info_log_, db_statistics_,
+      status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+      do_merge ? value : nullptr, value_found, merge_context, do_merge,
+      max_covering_tombstone_seq, this->env_, seq,
+      merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob,
+      tracing_get_id);
+
+  // Pin blocks that we read to hold merge operands
+  if (merge_operator_) {
+    pinned_iters_mgr.StartPinning();
+  }
+
+  FilePicker fp(
+      storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_,
+      storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_,
+      user_comparator(), internal_comparator());
+  FdWithKeyRange* f = fp.GetNextFile();
+
+  while (f != nullptr) {
+    if (*max_covering_tombstone_seq > 0) {
+      // The remaining files we look at will only contain covered keys, so we
+      // stop here.
+      break;
+    }
+    if (get_context.sample()) {
+      sample_file_read_inc(f->file_metadata);
+    }
+
+    bool timer_enabled =
+        GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+        get_perf_context()->per_level_perf_context_enabled;
+    StopWatchNano timer(env_, timer_enabled /* auto_start */);
+    *status = table_cache_->Get(
+        read_options, *internal_comparator(), *f->file_metadata, ikey,
+        &get_context, mutable_cf_options_.prefix_extractor.get(),
+        cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+        IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                        fp.IsHitFileLastInLevel()),
+        fp.GetCurrentLevel());
+    // TODO: examine the behavior for corrupted key
+    if (timer_enabled) {
+      PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+                                fp.GetCurrentLevel());
+    }
+    if (!status->ok()) {
+      return;
+    }
+
+    // report the counters before returning
+    if (get_context.State() != GetContext::kNotFound &&
+        get_context.State() != GetContext::kMerge &&
+        db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    switch (get_context.State()) {
+      case GetContext::kNotFound:
+        // Keep searching in other files
+        break;
+      case GetContext::kMerge:
+        // TODO: update per-level perfcontext user_key_return_count for kMerge
+        break;
+      case GetContext::kFound:
+        if (fp.GetHitFileLevel() == 0) {
+          RecordTick(db_statistics_, GET_HIT_L0);
+        } else if (fp.GetHitFileLevel() == 1) {
+          RecordTick(db_statistics_, GET_HIT_L1);
+        } else if (fp.GetHitFileLevel() >= 2) {
+          RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+        }
+        PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+                                  fp.GetHitFileLevel());
+        return;
+      case GetContext::kDeleted:
+        // Use empty error message for speed
+        *status = Status::NotFound();
+        return;
+      case GetContext::kCorrupt:
+        *status = Status::Corruption("corrupted key for ", user_key);
+        return;
+      case GetContext::kBlobIndex:
+        ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+        *status = Status::NotSupported(
+            "Encounter unexpected blob index. Please open DB with "
+            "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+        return;
+    }
+    f = fp.GetNextFile();
+  }
+  if (db_statistics_ != nullptr) {
+    get_context.ReportCounters();
+  }
+  if (GetContext::kMerge == get_context.State()) {
+    if (!do_merge) {
+      *status = Status::OK();
+      return;
+    }
+    if (!merge_operator_) {
+      *status = Status::InvalidArgument(
+          "merge_operator is not properly initialized.");
+      return;
+    }
+    // merge_operands are in the saver and we hit the beginning of the key
+    // history; do a final merge of nullptr and the operands
+    std::string* str_value = value != nullptr ? value->GetSelf() : nullptr;
+    *status = MergeHelper::TimedFullMerge(
+        merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+        str_value, info_log_, db_statistics_, env_,
+        nullptr /* result_operand */, true);
+    if (LIKELY(value != nullptr)) {
+      value->PinSelf();
+    }
+  } else {
+    if (key_exists != nullptr) {
+      *key_exists = false;
+    }
+    *status = Status::NotFound();  // Use an empty error message for speed
+  }
+}
+
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                       ReadCallback* callback, bool* is_blob) {
+  PinnedIteratorsManager pinned_iters_mgr;
+
+  // Pin blocks that we read to hold merge operands
+  if (merge_operator_) {
+    pinned_iters_mgr.StartPinning();
+  }
+  uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+  if (vset_ && vset_->block_cache_tracer_ &&
+      vset_->block_cache_tracer_->is_tracing_enabled()) {
+    tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+  }
+  // Even though we know the batch size won't be > MAX_BATCH_SIZE,
+  // use autovector in order to avoid unnecessary construction of GetContext
+  // objects, which is expensive
+  autovector<GetContext, 16> get_ctx;
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    assert(iter->s->ok() || iter->s->IsMergeInProgress());
+    get_ctx.emplace_back(
+        user_comparator(), merge_operator_, info_log_, db_statistics_,
+        iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey,
+        iter->value, nullptr, &(iter->merge_context), true,
+        &iter->max_covering_tombstone_seq, this->env_, nullptr,
+        merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob,
+        tracing_mget_id);
+    // MergeInProgress status, if set, has been transferred to the get_context
+    // state, so we set status to ok here. From now on, the iter status will
+    // be used for IO errors, and get_context state will be used for any
+    // key-level errors
+    *(iter->s) = Status::OK();
+  }
+  int get_ctx_index = 0;
+  for (auto iter = range->begin(); iter != range->end();
+       ++iter, get_ctx_index++) {
+    iter->get_context = &(get_ctx[get_ctx_index]);
+  }
+
+  MultiGetRange file_picker_range(*range, range->begin(), range->end());
+  FilePickerMultiGet fp(
+      &file_picker_range,
+      &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_,
+      &storage_info_.file_indexer_, user_comparator(), internal_comparator());
+  FdWithKeyRange* f = fp.GetNextFile();
+
+  while (f != nullptr) {
+    MultiGetRange file_range = fp.CurrentFileRange();
+    bool timer_enabled =
+        GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+        get_perf_context()->per_level_perf_context_enabled;
+    StopWatchNano timer(env_, timer_enabled /* auto_start */);
+    Status s = table_cache_->MultiGet(
+        read_options, *internal_comparator(), *f->file_metadata, &file_range,
+        mutable_cf_options_.prefix_extractor.get(),
+        cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+        IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                        fp.IsHitFileLastInLevel()),
+        fp.GetCurrentLevel());
+    // TODO: examine the behavior for corrupted key
+    if (timer_enabled) {
+      PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+                                fp.GetCurrentLevel());
+    }
+    if (!s.ok()) {
+      // TODO: Set status for individual keys appropriately
+      for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+        *iter->s = s;
+        file_range.MarkKeyDone(iter);
+      }
+      return;
+    }
+    uint64_t batch_size = 0;
+    for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+      GetContext& get_context = *iter->get_context;
+      Status* status = iter->s;
+      // The Status in the KeyContext takes precedence over GetContext state
+      // Status may be an error if there were any IO errors in the table
+      // reader. We never expect Status to be NotFound(), as that is
+      // determined by get_context
+      assert(!status->IsNotFound());
+      if (!status->ok()) {
+        file_range.MarkKeyDone(iter);
+        continue;
+      }
+
+      if (get_context.sample()) {
+        sample_file_read_inc(f->file_metadata);
+      }
+      batch_size++;
+      // report the counters before returning
+      if (get_context.State() != GetContext::kNotFound &&
+          get_context.State() != GetContext::kMerge &&
+          db_statistics_ != nullptr) {
+        get_context.ReportCounters();
+      } else {
+        if (iter->max_covering_tombstone_seq > 0) {
+          // The remaining files we look at will only contain covered keys, so
+          // we stop here for this key
+          file_picker_range.SkipKey(iter);
+        }
+      }
+      switch (get_context.State()) {
+        case GetContext::kNotFound:
+          // Keep searching in other files
+          break;
+        case GetContext::kMerge:
+          // TODO: update per-level perfcontext user_key_return_count for kMerge
+          break;
+        case GetContext::kFound:
+          if (fp.GetHitFileLevel() == 0) {
+            RecordTick(db_statistics_, GET_HIT_L0);
+          } else if (fp.GetHitFileLevel() == 1) {
+            RecordTick(db_statistics_, GET_HIT_L1);
+          } else if (fp.GetHitFileLevel() >= 2) {
+            RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+          }
+          PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+                                    fp.GetHitFileLevel());
+          file_range.MarkKeyDone(iter);
+          continue;
+        case GetContext::kDeleted:
+          // Use empty error message for speed
+          *status = Status::NotFound();
+          file_range.MarkKeyDone(iter);
+          continue;
+        case GetContext::kCorrupt:
+          *status =
+              Status::Corruption("corrupted key for ", iter->lkey->user_key());
+          file_range.MarkKeyDone(iter);
+          continue;
+        case GetContext::kBlobIndex:
+          ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+          *status = Status::NotSupported(
+              "Encounter unexpected blob index. Please open DB with "
+              "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+          file_range.MarkKeyDone(iter);
+          continue;
+      }
+    }
+    RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
+    if (file_picker_range.empty()) {
+      break;
+    }
+    f = fp.GetNextFile();
+  }
+
+  // Process any left over keys
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    GetContext& get_context = *iter->get_context;
+    Status* status = iter->s;
+    Slice user_key = iter->lkey->user_key();
+
+    if (db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    if (GetContext::kMerge == get_context.State()) {
+      if (!merge_operator_) {
+        *status = Status::InvalidArgument(
+            "merge_operator is not properly initialized.");
+        range->MarkKeyDone(iter);
+        continue;
+      }
+      // merge_operands are in the saver and we hit the beginning of the key
+      // history; do a final merge of nullptr and the operands
+      std::string* str_value =
+          iter->value != nullptr ? iter->value->GetSelf() : nullptr;
+      *status = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
+          str_value, info_log_, db_statistics_, env_,
+          nullptr /* result_operand */, true);
+      if (LIKELY(iter->value != nullptr)) {
+        iter->value->PinSelf();
+      }
+    } else {
+      range->MarkKeyDone(iter);
+      *status = Status::NotFound();  // Use an empty error message for speed
+    }
+  }
+}
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+  // Reaching the bottom level implies misses at all upper levels, so we'll
+  // skip checking the filters when we predict a hit.
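+  // Note this only fires on the last non-empty level, and for L0 only on the
+  // last file in the level: a miss in an earlier L0 file still has more
+  // files to check, so its filter is still consulted.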
+  return cfd_->ioptions()->optimize_filters_for_hits &&
+         (level > 0 || is_file_last_in_level) &&
+         level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+  level_files_brief_.resize(num_non_empty_levels_);
+  for (int level = 0; level < num_non_empty_levels_; level++) {
+    DoGenerateLevelFilesBrief(
+        &level_files_brief_[level], files_[level], &arena_);
+  }
+}
+
+void Version::PrepareApply(
+    const MutableCFOptions& mutable_cf_options,
+    bool update_stats) {
+  UpdateAccumulatedStats(update_stats);
+  storage_info_.UpdateNumNonEmptyLevels();
+  storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
+  storage_info_.UpdateFilesByCompactionPri(cfd_->ioptions()->compaction_pri);
+  storage_info_.GenerateFileIndexer();
+  storage_info_.GenerateLevelFilesBrief();
+  storage_info_.GenerateLevel0NonOverlapping();
+  storage_info_.GenerateBottommostFiles();
+}
+
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file ||
+      file_meta->compensated_file_size > 0) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  Status s = GetTableProperties(&tp, file_meta);
+  file_meta->init_stats_from_file = true;
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+                    "Unable to load table properties for file %" PRIu64
+                    " --- %s\n",
+                    file_meta->fd.GetNumber(), s.ToString().c_str());
+    return false;
+  }
+  if (tp.get() == nullptr) return false;
+  file_meta->num_entries = tp->num_entries;
+  file_meta->num_deletions = tp->num_deletions;
+  file_meta->raw_value_size = tp->raw_value_size;
+  file_meta->raw_key_size = tp->raw_key_size;
+
+  return true;
+}
+
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+  TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+                           nullptr);
+
+  assert(file_meta->init_stats_from_file);
+  accumulated_file_size_ += file_meta->fd.GetFileSize();
+  accumulated_raw_key_size_ += file_meta->raw_key_size;
+  accumulated_raw_value_size_ += file_meta->raw_value_size;
+  accumulated_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  accumulated_num_deletions_ += file_meta->num_deletions;
+
+  current_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  current_num_deletions_ += file_meta->num_deletions;
+  current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file) {
+    current_num_non_deletions_ -=
+        file_meta->num_entries - file_meta->num_deletions;
+    current_num_deletions_ -= file_meta->num_deletions;
+    current_num_samples_--;
+  }
+}
+
+void Version::UpdateAccumulatedStats(bool update_stats) {
+  if (update_stats) {
+    // maximum number of table properties loaded from files.
+    const int kMaxInitCount = 20;
+    int init_count = 0;
+    // Here only the first kMaxInitCount files which haven't been
+    // initialized from file will be updated with num_deletions.
+    // The motivation is to cap the maximum I/O per Version creation.
+    // The reason for choosing files from lower levels instead of higher
+    // levels is that this design propagates the initialization upward:
+    // once the num_deletions of lower-level files are updated, those files
+    // obtain accurate compensated_file_size, which in turn triggers
+    // lower-level to higher-level compactions; the resulting higher-level
+    // files then have their num_deletions updated here as well.
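+    // e.g. with kMaxInitCount == 20, at most 20 table files are opened per
+    // Version creation for stats sampling, unless the table cache is
+    // unbounded (max_open_files = -1), in which case the properties are
+    // already in memory and sampling every file costs no extra I/O.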
+    for (int level = 0;
+         level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+         ++level) {
+      for (auto* file_meta : storage_info_.files_[level]) {
+        if (MaybeInitializeFileMetaData(file_meta)) {
+          // each FileMeta will be initialized only once.
+          storage_info_.UpdateAccumulatedStats(file_meta);
+          // when option "max_open_files" is -1, all the file metadata has
+          // already been read, so MaybeInitializeFileMetaData() won't incur
+          // any I/O cost. "max_open_files=-1" means that the table cache passed
+          // to the VersionSet and then to the ColumnFamilySet has a size of
+          // TableCache::kInfiniteCapacity
+          if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+              TableCache::kInfiniteCapacity) {
+            continue;
+          }
+          if (++init_count >= kMaxInitCount) {
+            break;
+          }
+        }
+      }
+    }
+    // In case all sampled-files contain only deletion entries, then we
+    // load the table-property of a file in higher-level to initialize
+    // that value.
+    for (int level = storage_info_.num_levels_ - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+         --level) {
+      for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+           storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+        if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+          storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+        }
+      }
+    }
+  }
+
+  storage_info_.ComputeCompensatedSizes();
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+  static const int kDeletionWeightOnCompaction = 2;
+  uint64_t average_value_size = GetAverageValueSize();
+
+  // compute the compensated size
+  for (int level = 0; level < num_levels_; level++) {
+    for (auto* file_meta : files_[level]) {
+      // Here we only compute compensated_file_size for those file_metas
+      // whose compensated_file_size is uninitialized (== 0). This is true only
+      // for files that have been created right now and no other thread has
+      // access to them. That's why we can safely mutate compensated_file_size.
+      if (file_meta->compensated_file_size == 0) {
+        file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+        // We boost the size of deletion entries of a file only when the
+        // number of deletion entries is greater than the number of
+        // non-deletion entries in the file. The motivation here is that in
+        // a stable workload, the number of deletion entries should be roughly
+        // equal to the number of non-deletion entries. If we compensate the
+        // size of deletion entries in a stable workload, the deletion
+        // compensation logic might introduce unwanted effects, which change
+        // the shape of the LSM tree.
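+        // e.g. a file with num_entries == 100 and num_deletions == 60 gets
+        // charged for 60 * 2 - 100 == 20 surplus deletions, i.e. an extra
+        // 20 * average_value_size * kDeletionWeightOnCompaction bytes.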
+        if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+          file_meta->compensated_file_size +=
+              (file_meta->num_deletions * 2 - file_meta->num_entries) *
+              average_value_size * kDeletionWeightOnCompaction;
+        }
+      }
+    }
+  }
+}
+
+int VersionStorageInfo::MaxInputLevel() const {
+  if (compaction_style_ == kCompactionStyleLevel) {
+    return num_levels() - 2;
+  }
+  return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+  if (allow_ingest_behind) {
+    assert(num_levels() > 1);
+    return num_levels() - 2;
+  }
+  return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+    const MutableCFOptions& mutable_cf_options) {
+  // Only implemented for level-based compaction
+  if (compaction_style_ != kCompactionStyleLevel) {
+    estimated_compaction_needed_bytes_ = 0;
+    return;
+  }
+
+  // Start from Level 0. If level 0 qualifies for compaction to level 1,
+  // we estimate the size of that compaction.
+  // Then we move on to the next level and see whether it qualifies for
+  // compaction to the next level. The size of a level is estimated as the
+  // actual size on the level plus the input bytes from the previous level,
+  // if there are any. If it exceeds the target, take the excess bytes as
+  // compaction input and add the compaction size to the total.
+  // We keep doing this for Level 2, 3, etc., until the last level, and
+  // return the accumulated bytes.
+
+  uint64_t bytes_compact_to_next_level = 0;
+  uint64_t level_size = 0;
+  for (auto* f : files_[0]) {
+    level_size += f->fd.GetFileSize();
+  }
+  // Level 0
+  bool level0_compact_triggered = false;
+  if (static_cast<int>(files_[0].size()) >=
+          mutable_cf_options.level0_file_num_compaction_trigger ||
+      level_size >= mutable_cf_options.max_bytes_for_level_base) {
+    level0_compact_triggered = true;
+    estimated_compaction_needed_bytes_ = level_size;
+    bytes_compact_to_next_level = level_size;
+  } else {
+    estimated_compaction_needed_bytes_ = 0;
+  }
+
+  // Level 1 and up.
+  uint64_t bytes_next_level = 0;
+  for (int level = base_level(); level <= MaxInputLevel(); level++) {
+    level_size = 0;
+    if (bytes_next_level > 0) {
+#ifndef NDEBUG
+      uint64_t level_size2 = 0;
+      for (auto* f : files_[level]) {
+        level_size2 += f->fd.GetFileSize();
+      }
+      assert(level_size2 == bytes_next_level);
+#endif
+      level_size = bytes_next_level;
+      bytes_next_level = 0;
+    } else {
+      for (auto* f : files_[level]) {
+        level_size += f->fd.GetFileSize();
+      }
+    }
+    if (level == base_level() && level0_compact_triggered) {
+      // Add base level size to compaction if level0 compaction triggered.
+      estimated_compaction_needed_bytes_ += level_size;
+    }
+    // Add size added by previous compaction
+    level_size += bytes_compact_to_next_level;
+    bytes_compact_to_next_level = 0;
+    uint64_t level_target = MaxBytesForLevel(level);
+    if (level_size > level_target) {
+      bytes_compact_to_next_level = level_size - level_target;
+      // Estimate the actual compaction fan-out ratio as size ratio between
+      // the two levels.
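+      // e.g. if this level holds 10GB against an 8GB target and the next
+      // level holds 40GB, the 2GB excess is charged at a ratio of
+      // (40 / 10 + 1), adding 2GB * 5 = 10GB to the estimate: the moved
+      // input plus the next-level data it is rewritten with.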
+ + assert(bytes_next_level == 0); + if (level + 1 < num_levels_) { + for (auto* f : files_[level + 1]) { + bytes_next_level += f->fd.GetFileSize(); + } + } + if (bytes_next_level > 0) { + assert(level_size > 0); + estimated_compaction_needed_bytes_ += static_cast( + static_cast(bytes_compact_to_next_level) * + (static_cast(bytes_next_level) / + static_cast(level_size) + + 1)); + } + } + } +} + +namespace { +uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + const std::vector& files) { + uint32_t ttl_expired_files_count = 0; + + int64_t _current_time; + auto status = ioptions.env->GetCurrentTime(&_current_time); + if (status.ok()) { + const uint64_t current_time = static_cast(_current_time); + for (FileMetaData* f : files) { + if (!f->being_compacted) { + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0 && + oldest_ancester_time < (current_time - mutable_cf_options.ttl)) { + ttl_expired_files_count++; + } + } + } + } + return ttl_expired_files_count; +} +} // anonymous namespace + +void VersionStorageInfo::ComputeCompactionScore( + const ImmutableCFOptions& immutable_cf_options, + const MutableCFOptions& mutable_cf_options) { + for (int level = 0; level <= MaxInputLevel(); level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). + int num_sorted_runs = 0; + uint64_t total_size = 0; + for (auto* f : files_[level]) { + if (!f->being_compacted) { + total_size += f->compensated_file_size; + num_sorted_runs++; + } + } + if (compaction_style_ == kCompactionStyleUniversal) { + // For universal compaction, we use level0 score to indicate + // compaction score for the whole DB. Adding other levels as if + // they are L0 files. + for (int i = 1; i < num_levels(); i++) { + if (!files_[i].empty() && !files_[i][0]->being_compacted) { + num_sorted_runs++; + } + } + } + + if (compaction_style_ == kCompactionStyleFIFO) { + score = static_cast(total_size) / + mutable_cf_options.compaction_options_fifo.max_table_files_size; + if (mutable_cf_options.compaction_options_fifo.allow_compaction) { + score = std::max( + static_cast(num_sorted_runs) / + mutable_cf_options.level0_file_num_compaction_trigger, + score); + } + if (mutable_cf_options.ttl > 0) { + score = std::max( + static_cast(GetExpiredTtlFilesCount( + immutable_cf_options, mutable_cf_options, files_[level])), + score); + } + + } else { + score = static_cast(num_sorted_runs) / + mutable_cf_options.level0_file_num_compaction_trigger; + if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { + // Level-based involves L0->L0 compactions that can lead to oversized + // L0 files. Take into account size as well to avoid later giant + // compactions to the base level. + score = std::max( + score, static_cast(total_size) / + mutable_cf_options.max_bytes_for_level_base); + } + } + } else { + // Compute the ratio of current size to size limit. 
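+      // For example, with illustrative numbers: a level holding 15GB of
+      // data not under compaction against a 10GB limit scores 1.5. Levels
+      // scoring above 1.0 are compaction candidates, and the sort below
+      // serves the highest score first.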
+ uint64_t level_bytes_no_compacting = 0; + for (auto f : files_[level]) { + if (!f->being_compacted) { + level_bytes_no_compacting += f->compensated_file_size; + } + } + score = static_cast(level_bytes_no_compacting) / + MaxBytesForLevel(level); + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. + for (int i = 0; i < num_levels() - 2; i++) { + for (int j = i + 1; j < num_levels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } + ComputeFilesMarkedForCompaction(); + ComputeBottommostFilesMarkedForCompaction(); + if (mutable_cf_options.ttl > 0) { + ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl); + } + if (mutable_cf_options.periodic_compaction_seconds > 0) { + ComputeFilesMarkedForPeriodicCompaction( + immutable_cf_options, mutable_cf_options.periodic_compaction_seconds); + } + EstimateCompactionBytesNeeded(mutable_cf_options); +} + +void VersionStorageInfo::ComputeFilesMarkedForCompaction() { + files_marked_for_compaction_.clear(); + int last_qualify_level = 0; + + // Do not include files from the last level with data + // If table properties collector suggests a file on the last level, + // we should not move it to a new level. + for (int level = num_levels() - 1; level >= 1; level--) { + if (!files_[level].empty()) { + last_qualify_level = level - 1; + break; + } + } + + for (int level = 0; level <= last_qualify_level; level++) { + for (auto* f : files_[level]) { + if (!f->being_compacted && f->marked_for_compaction) { + files_marked_for_compaction_.emplace_back(level, f); + } + } + } +} + +void VersionStorageInfo::ComputeExpiredTtlFiles( + const ImmutableCFOptions& ioptions, const uint64_t ttl) { + assert(ttl > 0); + + expired_ttl_files_.clear(); + + int64_t _current_time; + auto status = ioptions.env->GetCurrentTime(&_current_time); + if (!status.ok()) { + return; + } + const uint64_t current_time = static_cast(_current_time); + + for (int level = 0; level < num_levels() - 1; level++) { + for (FileMetaData* f : files_[level]) { + if (!f->being_compacted) { + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time > 0 && + oldest_ancester_time < (current_time - ttl)) { + expired_ttl_files_.emplace_back(level, f); + } + } + } + } +} + +void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( + const ImmutableCFOptions& ioptions, + const uint64_t periodic_compaction_seconds) { + assert(periodic_compaction_seconds > 0); + + files_marked_for_periodic_compaction_.clear(); + + int64_t temp_current_time; + auto status = ioptions.env->GetCurrentTime(&temp_current_time); + if (!status.ok()) { + return; + } + const uint64_t current_time = static_cast(temp_current_time); + + // If periodic_compaction_seconds is larger than current time, periodic + // compaction can't possibly be triggered. 
+  if (periodic_compaction_seconds > current_time) {
+    return;
+  }
+
+  const uint64_t allowed_time_limit =
+      current_time - periodic_compaction_seconds;
+
+  for (int level = 0; level < num_levels(); level++) {
+    for (auto f : files_[level]) {
+      if (!f->being_compacted) {
+        // Compute a file's modification time in the following order:
+        // 1. Use file_creation_time table property if it is > 0.
+        // 2. Use creation_time table property if it is > 0.
+        // 3. Use file's mtime metadata if the above two table properties
+        //    are 0.
+        // Don't consider the file at all if the modification time cannot be
+        // correctly determined based on the above conditions.
+        uint64_t file_modification_time = f->TryGetFileCreationTime();
+        if (file_modification_time == kUnknownFileCreationTime) {
+          file_modification_time = f->TryGetOldestAncesterTime();
+        }
+        if (file_modification_time == kUnknownOldestAncesterTime) {
+          auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
+                                         f->fd.GetPathId());
+          status = ioptions.env->GetFileModificationTime(
+              file_path, &file_modification_time);
+          if (!status.ok()) {
+            ROCKS_LOG_WARN(ioptions.info_log,
+                           "Can't get file modification time: %s: %s",
+                           file_path.c_str(), status.ToString().c_str());
+            continue;
+          }
+        }
+        if (file_modification_time > 0 &&
+            file_modification_time < allowed_time_limit) {
+          files_marked_for_periodic_compaction_.emplace_back(level, f);
+        }
+      }
+    }
+  }
+}
+
+namespace {
+
+// used to sort files by size
+struct Fsize {
+  size_t index;
+  FileMetaData* file;
+};
+
+// Comparator that is used to sort files based on their size
+// In normal mode: descending size
+bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
+  return (first.file->compensated_file_size >
+          second.file->compensated_file_size);
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::AddFile(int level, FileMetaData* f,
+                                 Logger* info_log) {
+  auto* level_files = &files_[level];
+  // Must not overlap
+#ifndef NDEBUG
+  if (level > 0 && !level_files->empty() &&
+      internal_comparator_->Compare(
+          (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) {
+    auto* f2 = (*level_files)[level_files->size() - 1];
+    if (info_log != nullptr) {
+      Error(info_log,
+            "Adding new file %" PRIu64
+            " range (%s, %s) to level %d but overlapping "
+            "with existing file %" PRIu64 " %s %s",
+            f->fd.GetNumber(), f->smallest.DebugString(true).c_str(),
+            f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(),
+            f2->smallest.DebugString(true).c_str(),
+            f2->largest.DebugString(true).c_str());
+      LogFlush(info_log);
+    }
+    assert(false);
+  }
+#else
+  (void)info_log;
+#endif
+  f->refs++;
+  level_files->push_back(f);
+}
+
+// Version::PrepareApply() needs to be called before calling this function,
+// or the following functions need to have been called:
+// 1. UpdateNumNonEmptyLevels();
+// 2. CalculateBaseBytes();
+// 3. UpdateFilesByCompactionPri();
+// 4. GenerateFileIndexer();
+// 5. GenerateLevelFilesBrief();
+// 6. GenerateLevel0NonOverlapping();
+// 7. GenerateBottommostFiles();
+void VersionStorageInfo::SetFinalized() {
+  finalized_ = true;
+#ifndef NDEBUG
+  if (compaction_style_ != kCompactionStyleLevel) {
+    // Not level based compaction.
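+    // FIFO and universal compaction manage files as whole sorted runs, so
+    // the level-shape invariants checked below do not apply to them.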
+ return; + } + assert(base_level_ < 0 || num_levels() == 1 || + (base_level_ >= 1 && base_level_ < num_levels())); + // Verify all levels newer than base_level are empty except L0 + for (int level = 1; level < base_level(); level++) { + assert(NumLevelBytes(level) == 0); + } + uint64_t max_bytes_prev_level = 0; + for (int level = base_level(); level < num_levels() - 1; level++) { + if (LevelFiles(level).size() == 0) { + continue; + } + assert(MaxBytesForLevel(level) >= max_bytes_prev_level); + max_bytes_prev_level = MaxBytesForLevel(level); + } + int num_empty_non_l0_level = 0; + for (int level = 0; level < num_levels(); level++) { + assert(LevelFiles(level).size() == 0 || + LevelFiles(level).size() == LevelFilesBrief(level).num_files); + if (level > 0 && NumLevelBytes(level) > 0) { + num_empty_non_l0_level++; + } + if (LevelFiles(level).size() > 0) { + assert(level < num_non_empty_levels()); + } + } + assert(compaction_level_.size() > 0); + assert(compaction_level_.size() == compaction_score_.size()); +#endif +} + +void VersionStorageInfo::UpdateNumNonEmptyLevels() { + num_non_empty_levels_ = num_levels_; + for (int i = num_levels_ - 1; i >= 0; i--) { + if (files_[i].size() != 0) { + return; + } else { + num_non_empty_levels_ = i; + } + } +} + +namespace { +// Sort `temp` based on ratio of overlapping size over file size +void SortFileByOverlappingRatio( + const InternalKeyComparator& icmp, const std::vector& files, + const std::vector& next_level_files, + std::vector* temp) { + std::unordered_map file_to_order; + auto next_level_it = next_level_files.begin(); + + for (auto& file : files) { + uint64_t overlapping_bytes = 0; + // Skip files in next level that is smaller than current file + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->largest, file->smallest) < 0) { + next_level_it++; + } + + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->smallest, file->largest) < 0) { + overlapping_bytes += (*next_level_it)->fd.file_size; + + if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) { + // next level file cross large boundary of current file. + break; + } + next_level_it++; + } + + assert(file->compensated_file_size != 0); + file_to_order[file->fd.GetNumber()] = + overlapping_bytes * 1024u / file->compensated_file_size; + } + + std::sort(temp->begin(), temp->end(), + [&](const Fsize& f1, const Fsize& f2) -> bool { + return file_to_order[f1.file->fd.GetNumber()] < + file_to_order[f2.file->fd.GetNumber()]; + }); +} +} // namespace + +void VersionStorageInfo::UpdateFilesByCompactionPri( + CompactionPri compaction_pri) { + if (compaction_style_ == kCompactionStyleNone || + compaction_style_ == kCompactionStyleFIFO || + compaction_style_ == kCompactionStyleUniversal) { + // don't need this + return; + } + // No need to sort the highest level because it is never compacted. 
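+  // For example, under kMinOverlappingRatio with illustrative numbers: a
+  // 1GB file overlapping 2GB in the next level is keyed at
+  // 2GB * 1024 / 1GB = 2048, while a 1GB file overlapping only 512MB is
+  // keyed at 512 and sorts first, since compacting it rewrites less data
+  // in the next level.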
+ for (int level = 0; level < num_levels() - 1; level++) { + const std::vector& files = files_[level]; + auto& files_by_compaction_pri = files_by_compaction_pri_[level]; + assert(files_by_compaction_pri.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (size_t i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + size_t num = VersionStorageInfo::kNumberFilesToSort; + if (num > temp.size()) { + num = temp.size(); + } + switch (compaction_pri) { + case kByCompensatedSize: + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareCompensatedSizeDescending); + break; + case kOldestLargestSeqFirst: + std::sort(temp.begin(), temp.end(), + [](const Fsize& f1, const Fsize& f2) -> bool { + return f1.file->fd.largest_seqno < + f2.file->fd.largest_seqno; + }); + break; + case kOldestSmallestSeqFirst: + std::sort(temp.begin(), temp.end(), + [](const Fsize& f1, const Fsize& f2) -> bool { + return f1.file->fd.smallest_seqno < + f2.file->fd.smallest_seqno; + }); + break; + case kMinOverlappingRatio: + SortFileByOverlappingRatio(*internal_comparator_, files_[level], + files_[level + 1], &temp); + break; + default: + assert(false); + } + assert(temp.size() == files.size()); + + // initialize files_by_compaction_pri_ + for (size_t i = 0; i < temp.size(); i++) { + files_by_compaction_pri.push_back(static_cast(temp[i].index)); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_compaction_pri_[level].size()); + } +} + +void VersionStorageInfo::GenerateLevel0NonOverlapping() { + assert(!finalized_); + level0_non_overlapping_ = true; + if (level_files_brief_.size() == 0) { + return; + } + + // A copy of L0 files sorted by smallest key + std::vector level0_sorted_file( + level_files_brief_[0].files, + level_files_brief_[0].files + level_files_brief_[0].num_files); + std::sort(level0_sorted_file.begin(), level0_sorted_file.end(), + [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool { + return (internal_comparator_->Compare(f1.smallest_key, + f2.smallest_key) < 0); + }); + + for (size_t i = 1; i < level0_sorted_file.size(); ++i) { + FdWithKeyRange& f = level0_sorted_file[i]; + FdWithKeyRange& prev = level0_sorted_file[i - 1]; + if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) { + level0_non_overlapping_ = false; + break; + } + } +} + +void VersionStorageInfo::GenerateBottommostFiles() { + assert(!finalized_); + assert(bottommost_files_.empty()); + for (size_t level = 0; level < level_files_brief_.size(); ++level) { + for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files; + ++file_idx) { + const FdWithKeyRange& f = level_files_brief_[level].files[file_idx]; + int l0_file_idx; + if (level == 0) { + l0_file_idx = static_cast(file_idx); + } else { + l0_file_idx = -1; + } + Slice smallest_user_key = ExtractUserKey(f.smallest_key); + Slice largest_user_key = ExtractUserKey(f.largest_key); + if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key, + static_cast(level), + l0_file_idx)) { + bottommost_files_.emplace_back(static_cast(level), + f.file_metadata); + } + } + } +} + +void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) { + assert(seqnum >= oldest_snapshot_seqnum_); + oldest_snapshot_seqnum_ = seqnum; + if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) { + ComputeBottommostFilesMarkedForCompaction(); + 
} +} + +void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() { + bottommost_files_marked_for_compaction_.clear(); + bottommost_files_mark_threshold_ = kMaxSequenceNumber; + for (auto& level_and_file : bottommost_files_) { + if (!level_and_file.second->being_compacted && + level_and_file.second->fd.largest_seqno != 0 && + level_and_file.second->num_deletions > 1) { + // largest_seqno might be nonzero due to containing the final key in an + // earlier compaction, whose seqnum we didn't zero out. Multiple deletions + // ensures the file really contains deleted or overwritten keys. + if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) { + bottommost_files_marked_for_compaction_.push_back(level_and_file); + } else { + bottommost_files_mark_threshold_ = + std::min(bottommost_files_mark_threshold_, + level_and_file.second->fd.largest_seqno); + } + } + } +} + +void Version::Ref() { + ++refs_; +} + +bool Version::Unref() { + assert(refs_ >= 1); + --refs_; + if (refs_ == 0) { + delete this; + return true; + } + return false; +} + +bool VersionStorageInfo::OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + if (level >= num_non_empty_levels_) { + // empty level, no overlap + return false; + } + return SomeFileOverlapsRange(*internal_comparator_, (level > 0), + level_files_brief_[level], smallest_user_key, + largest_user_key); +} + +// Store in "*inputs" all files in "level" that overlap [begin,end] +// If hint_index is specified, then it points to a file in the +// overlapping range. +// The file_index returns a pointer to any file in an overlapping range. +void VersionStorageInfo::GetOverlappingInputs( + int level, const InternalKey* begin, const InternalKey* end, + std::vector* inputs, int hint_index, int* file_index, + bool expand_range, InternalKey** next_smallest) const { + if (level >= num_non_empty_levels_) { + // this level is empty, no overlapping inputs + return; + } + + inputs->clear(); + if (file_index) { + *file_index = -1; + } + const Comparator* user_cmp = user_comparator_; + if (level > 0) { + GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, + file_index, false, next_smallest); + return; + } + + if (next_smallest) { + // next_smallest key only makes sense for non-level 0, where files are + // non-overlapping + *next_smallest = nullptr; + } + + Slice user_begin, user_end; + if (begin != nullptr) { + user_begin = begin->user_key(); + } + if (end != nullptr) { + user_end = end->user_key(); + } + + // index stores the file index need to check. + std::list index; + for (size_t i = 0; i < level_files_brief_[level].num_files; i++) { + index.emplace_back(i); + } + + while (!index.empty()) { + bool found_overlapping_file = false; + auto iter = index.begin(); + while (iter != index.end()) { + FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); + const Slice file_start = ExtractUserKey(f->smallest_key); + const Slice file_limit = ExtractUserKey(f->largest_key); + if (begin != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) { + // "f" is completely before specified range; skip it + iter++; + } else if (end != nullptr && + user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) { + // "f" is completely after specified range; skip it + iter++; + } else { + // if overlap + inputs->emplace_back(files_[level][*iter]); + found_overlapping_file = true; + // record the first file index. 
+        if (file_index && *file_index == -1) {
+          *file_index = static_cast<int>(*iter);
+        }
+        // this file overlaps; erase it so it is not checked again.
+        iter = index.erase(iter);
+        if (expand_range) {
+          if (begin != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+            user_begin = file_start;
+          }
+          if (end != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+            user_end = file_limit;
+          }
+        }
+      }
+    }
+    // if none of the remaining files overlap, stop
+    if (!found_overlapping_file) {
+      break;
+    }
+  }
+}
+
+// Store in "*inputs" files in "level" that are within the range [begin,end]
+// Guarantee a "clean cut" boundary between the files in *inputs and the
+// surrounding files, covering the maximum number of files.
+// This will ensure that no parts of a key are lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index,
+    int* file_index) const {
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  if (level >= num_non_empty_levels_ || level == 0 ||
+      level_files_brief_[level].num_files == 0) {
+    // this level is empty, no inputs within range
+    // also don't support clean input interval within L0
+    return;
+  }
+
+  GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
+                                        hint_index, file_index,
+                                        true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// if within_interval is set, then only store the maximum clean inputs
+// within the range [begin, end]. "clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+    bool within_interval, InternalKey** next_smallest) const {
+  assert(level > 0);
+
+  auto user_cmp = user_comparator_;
+  const FdWithKeyRange* files = level_files_brief_[level].files;
+  const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+  // use binary search to find the lower and upper bounds.
+  int start_index = 0;
+  int end_index = num_files;
+
+  if (begin != nullptr) {
+    // if within_interval is true, comparing against the file's smallest key
+    // makes std::lower_bound skip ranges that only partially overlap.
+    auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+                                             const InternalKey* k) {
+      auto& file_key = within_interval ? f.file_metadata->smallest
+                                       : f.file_metadata->largest;
+      return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+    };
+
+    start_index = static_cast<int>(
+        std::lower_bound(files,
+                         files + (hint_index == -1 ?
num_files : hint_index), + begin, cmp) - + files); + + if (start_index > 0 && within_interval) { + bool is_overlapping = true; + while (is_overlapping && start_index < num_files) { + auto& pre_limit = files[start_index - 1].file_metadata->largest; + auto& cur_start = files[start_index].file_metadata->smallest; + is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0; + start_index += is_overlapping; + } + } + } + + if (end != nullptr) { + // if within_interval is true, with file_key would find + // not overlapping ranges in std::upper_bound. + auto cmp = [&user_cmp, &within_interval](const InternalKey* k, + const FdWithKeyRange& f) { + auto& file_key = within_interval ? f.file_metadata->largest + : f.file_metadata->smallest; + return sstableKeyCompare(user_cmp, *k, file_key) < 0; + }; + + end_index = static_cast( + std::upper_bound(files + start_index, files + num_files, end, cmp) - + files); + + if (end_index < num_files && within_interval) { + bool is_overlapping = true; + while (is_overlapping && end_index > start_index) { + auto& next_start = files[end_index].file_metadata->smallest; + auto& cur_limit = files[end_index - 1].file_metadata->largest; + is_overlapping = + sstableKeyCompare(user_cmp, cur_limit, next_start) == 0; + end_index -= is_overlapping; + } + } + } + + assert(start_index <= end_index); + + // If there were no overlapping files, return immediately. + if (start_index == end_index) { + if (next_smallest) { + *next_smallest = nullptr; + } + return; + } + + assert(start_index < end_index); + + // returns the index where an overlap is found + if (file_index) { + *file_index = start_index; + } + + // insert overlapping files into vector + for (int i = start_index; i < end_index; i++) { + inputs->push_back(files_[level][i]); + } + + if (next_smallest != nullptr) { + // Provide the next key outside the range covered by inputs + if (end_index < static_cast(files_[level].size())) { + **next_smallest = files_[level][end_index]->smallest; + } else { + *next_smallest = nullptr; + } + } +} + +uint64_t VersionStorageInfo::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < num_levels()); + return TotalFileSize(files_[level]); +} + +const char* VersionStorageInfo::LevelSummary( + LevelSummaryStorage* scratch) const { + int len = 0; + if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { + assert(base_level_ < static_cast(level_max_bytes_.size())); + if (level_multiplier_ != 0.0) { + len = snprintf( + scratch->buffer, sizeof(scratch->buffer), + "base level %d level multiplier %.2f max bytes base %" PRIu64 " ", + base_level_, level_multiplier_, level_max_bytes_[base_level_]); + } + } + len += + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files["); + for (int i = 0; i < num_levels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + if (len > 0) { + // overwrite the last space + --len; + } + len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + "] max score %.2f", compaction_score_[0]); + + if (!files_marked_for_compaction_.empty()) { + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, + " (%" ROCKSDB_PRIszt " files need compaction)", + files_marked_for_compaction_.size()); + } + + return scratch->buffer; +} + +const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, 
sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + char sztxt[16]; + AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt)); + int ret = snprintf(scratch->buffer + len, sz, + "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", + f->fd.GetNumber(), f->fd.smallest_seqno, sztxt, + static_cast(f->being_compacted)); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + // overwrite the last space (only if files_[level].size() is non-zero) + if (files_[level].size() && len > 0) { + --len; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < num_levels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + assert(level >= 0); + assert(level < static_cast(level_max_bytes_.size())); + return level_max_bytes_[level]; +} + +void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, + const MutableCFOptions& options) { + // Special logic to set number of sorted runs. + // It is to match the previous behavior when all files are in L0. + int num_l0_count = static_cast(files_[0].size()); + if (compaction_style_ == kCompactionStyleUniversal) { + // For universal compaction, we use level0 score to indicate + // compaction score for the whole DB. Adding other levels as if + // they are L0 files. + for (int i = 1; i < num_levels(); i++) { + if (!files_[i].empty()) { + num_l0_count++; + } + } + } + set_l0_delay_trigger_count(num_l0_count); + + level_max_bytes_.resize(ioptions.num_levels); + if (!ioptions.level_compaction_dynamic_level_bytes) { + base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1; + + // Calculate for static bytes base case + for (int i = 0; i < ioptions.num_levels; ++i) { + if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) { + level_max_bytes_[i] = options.max_bytes_for_level_base; + } else if (i > 1) { + level_max_bytes_[i] = MultiplyCheckOverflow( + MultiplyCheckOverflow(level_max_bytes_[i - 1], + options.max_bytes_for_level_multiplier), + options.MaxBytesMultiplerAdditional(i - 1)); + } else { + level_max_bytes_[i] = options.max_bytes_for_level_base; + } + } + } else { + uint64_t max_level_size = 0; + + int first_non_empty_level = -1; + // Find size of non-L0 level of most data. + // Cannot use the size of the last level because it can be empty or less + // than previous levels after compaction. + for (int i = 1; i < num_levels_; i++) { + uint64_t total_size = 0; + for (const auto& f : files_[i]) { + total_size += f->fd.GetFileSize(); + } + if (total_size > 0 && first_non_empty_level == -1) { + first_non_empty_level = i; + } + if (total_size > max_level_size) { + max_level_size = total_size; + } + } + + // Prefill every level's max bytes to disallow compaction from there. + for (int i = 0; i < num_levels_; i++) { + level_max_bytes_[i] = std::numeric_limits::max(); + } + + if (max_level_size == 0) { + // No data for L1 and up. L0 compacts to last level directly. 
+      // No compaction from L1+ needs to be scheduled.
+      base_level_ = num_levels_ - 1;
+    } else {
+      uint64_t l0_size = 0;
+      for (const auto& f : files_[0]) {
+        l0_size += f->fd.GetFileSize();
+      }
+
+      uint64_t base_bytes_max =
+          std::max(options.max_bytes_for_level_base, l0_size);
+      uint64_t base_bytes_min = static_cast<uint64_t>(
+          base_bytes_max / options.max_bytes_for_level_multiplier);
+
+      // Try to make the last level's target size equal max_level_size.
+      uint64_t cur_level_size = max_level_size;
+      for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+        // Round up after dividing
+        cur_level_size = static_cast<uint64_t>(
+            cur_level_size / options.max_bytes_for_level_multiplier);
+      }
+
+      // Calculate base level and its size.
+      uint64_t base_level_size;
+      if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make the target size of the last level
+        // max_level_size, the target size of the first non-empty level
+        // would be smaller than base_bytes_min. We set it to base_bytes_min.
+        base_level_size = base_bytes_min + 1U;
+        base_level_ = first_non_empty_level;
+        ROCKS_LOG_INFO(ioptions.info_log,
+                       "More existing levels in DB than needed. "
+                       "max_bytes_for_level_multiplier may not be guaranteed.");
+      } else {
+        // Find base level (where L0 data is compacted to).
+        base_level_ = first_non_empty_level;
+        while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+          --base_level_;
+          cur_level_size = static_cast<uint64_t>(
+              cur_level_size / options.max_bytes_for_level_multiplier);
+        }
+        if (cur_level_size > base_bytes_max) {
+          // Even L1 will be too large
+          assert(base_level_ == 1);
+          base_level_size = base_bytes_max;
+        } else {
+          base_level_size = cur_level_size;
+        }
+      }
+
+      level_multiplier_ = options.max_bytes_for_level_multiplier;
+      assert(base_level_size > 0);
+      if (l0_size > base_level_size &&
+          (l0_size > options.max_bytes_for_level_base ||
+           static_cast<int>(files_[0].size() / 2) >=
+               options.level0_file_num_compaction_trigger)) {
+        // We adjust the base level according to the actual L0 size, and
+        // adjust the level multiplier accordingly, when:
+        //   1. the L0 size is larger than the level size base, or
+        //   2. the number of L0 files reaches twice the L0->L1 compaction
+        //      trigger.
+        // We don't do this otherwise to keep the LSM-tree structure stable
+        // unless the L0 compaction is backlogged.
+        base_level_size = l0_size;
+        if (base_level_ == num_levels_ - 1) {
+          level_multiplier_ = 1.0;
+        } else {
+          level_multiplier_ = std::pow(
+              static_cast<double>(max_level_size) /
+                  static_cast<double>(base_level_size),
+              1.0 / static_cast<double>(num_levels_ - base_level_ - 1));
+        }
+      }
+
+      uint64_t level_size = base_level_size;
+      for (int i = base_level_; i < num_levels_; i++) {
+        if (i > base_level_) {
+          level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
+        }
+        // Don't set any level below base_bytes_max. Otherwise, the LSM can
+        // assume an hourglass shape where L1+ sizes are smaller than L0.
+        // This causes compaction scoring, which depends on level sizes, to
+        // favor L1+ at the expense of L0, which may fill up and stall.
+        level_max_bytes_[i] = std::max(level_size, base_bytes_max);
+      }
+    }
+  }
+}
+
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+  // Estimate the live data size by adding up the size of the last level for
+  // all key ranges. Note: the estimate depends on the ordering of files in
+  // level 0 because files in level 0 can be overlapping.
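+  // Sketch of the approach: walk the levels from the bottom up and count a
+  // file only if its key range does not overlap one already counted from a
+  // lower level; files in newer levels over the same range mostly rewrite
+  // the same keys, so they are not added again.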
+ uint64_t size = 0; + + auto ikey_lt = [this](InternalKey* x, InternalKey* y) { + return internal_comparator_->Compare(*x, *y) < 0; + }; + // (Ordered) map of largest keys in non-overlapping files + std::map ranges(ikey_lt); + + for (int l = num_levels_ - 1; l >= 0; l--) { + bool found_end = false; + for (auto file : files_[l]) { + // Find the first file where the largest key is larger than the smallest + // key of the current file. If this file does not overlap with the + // current file, none of the files in the map does. If there is + // no potential overlap, we can safely insert the rest of this level + // (if the level is not 0) into the map without checking again because + // the elements in the level are sorted and non-overlapping. + auto lb = (found_end && l != 0) ? + ranges.end() : ranges.lower_bound(&file->smallest); + found_end = (lb == ranges.end()); + if (found_end || internal_comparator_->Compare( + file->largest, (*lb).second->smallest) < 0) { + ranges.emplace_hint(lb, &file->largest, file); + size += file->fd.file_size; + } + } + } + return size; +} + +bool VersionStorageInfo::RangeMightExistAfterSortedRun( + const Slice& smallest_user_key, const Slice& largest_user_key, + int last_level, int last_l0_idx) { + assert((last_l0_idx != -1) == (last_level == 0)); + // TODO(ajkr): this preserves earlier behavior where we considered an L0 file + // bottommost only if it's the oldest L0 file and there are no files on older + // levels. It'd be better to consider it bottommost if there's no overlap in + // older levels/files. + if (last_level == 0 && + last_l0_idx != static_cast(LevelFiles(0).size() - 1)) { + return true; + } + + // Checks whether there are files living beyond the `last_level`. If lower + // levels have files, it checks for overlap between [`smallest_key`, + // `largest_key`] and those files. Bottomlevel optimizations can be made if + // there are no files in lower levels or if there is no overlap with the files + // in the lower levels. + for (int level = last_level + 1; level < num_levels(); level++) { + // The range is not in the bottommost level if there are files in lower + // levels when the `last_level` is 0 or if there are files in lower levels + // which overlap with [`smallest_key`, `largest_key`]. + if (files_[level].size() > 0 && + (last_level == 0 || + OverlapInLevel(level, &smallest_user_key, &largest_user_key))) { + return true; + } + } + return false; +} + +void Version::AddLiveFiles(std::vector* live) { + for (int level = 0; level < storage_info_.num_levels(); level++) { + const std::vector& files = storage_info_.files_[level]; + for (const auto& file : files) { + live->push_back(file->fd); + } + } +} + +std::string Version::DebugString(bool hex, bool print_stats) const { + std::string r; + for (int level = 0; level < storage_info_.num_levels_; level++) { + // E.g., + // --- level 1 --- + // 17:123[1 .. 124]['a' .. 'd'] + // 20:43[124 .. 128]['e' .. 'g'] + // + // if print_stats=true: + // 17:123[1 .. 124]['a' .. 'd'](4096) + r.append("--- level "); + AppendNumberTo(&r, level); + r.append(" --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + const std::vector& files = storage_info_.files_[level]; + for (size_t i = 0; i < files.size(); i++) { + r.push_back(' '); + AppendNumberTo(&r, files[i]->fd.GetNumber()); + r.push_back(':'); + AppendNumberTo(&r, files[i]->fd.GetFileSize()); + r.append("["); + AppendNumberTo(&r, files[i]->fd.smallest_seqno); + r.append(" .. 
"); + AppendNumberTo(&r, files[i]->fd.largest_seqno); + r.append("]"); + r.append("["); + r.append(files[i]->smallest.DebugString(hex)); + r.append(" .. "); + r.append(files[i]->largest.DebugString(hex)); + r.append("]"); + if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, files[i]->oldest_blob_file_number); + } + if (print_stats) { + r.append("("); + r.append(ToString( + files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed))); + r.append(")"); + } + r.append("\n"); + } + } + return r; +} + +// this is used to batch writes to the manifest file +struct VersionSet::ManifestWriter { + Status status; + bool done; + InstrumentedCondVar cv; + ColumnFamilyData* cfd; + const MutableCFOptions mutable_cf_options; + const autovector& edit_list; + + explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const MutableCFOptions& cf_options, + const autovector& e) + : done(false), + cv(mu), + cfd(_cfd), + mutable_cf_options(cf_options), + edit_list(e) {} +}; + +Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { + assert(edit); + if (edit->is_in_atomic_group_) { + TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); + if (replay_buffer_.empty()) { + replay_buffer_.resize(edit->remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); + } + read_edits_in_atomic_group_++; + if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + static_cast(replay_buffer_.size())) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); + return Status::Corruption("corrupted atomic group"); + } + replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit; + if (read_edits_in_atomic_group_ == replay_buffer_.size()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); + return Status::OK(); + } + return Status::OK(); + } + + // A normal edit. 
+ if (!replay_buffer().empty()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit); + return Status::Corruption("corrupted atomic group"); + } + return Status::OK(); +} + +bool AtomicGroupReadBuffer::IsFull() const { + return read_edits_in_atomic_group_ == replay_buffer_.size(); +} + +bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); } + +void AtomicGroupReadBuffer::Clear() { + read_edits_in_atomic_group_ = 0; + replay_buffer_.clear(); +} + +VersionSet::VersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const FileOptions& storage_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) + : column_family_set_(new ColumnFamilySet( + dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, block_cache_tracer)), + env_(_db_options->env), + fs_(_db_options->fs.get()), + dbname_(dbname), + db_options_(_db_options), + next_file_number_(2), + manifest_file_number_(0), // Filled by Recover() + options_file_number_(0), + pending_manifest_file_number_(0), + last_sequence_(0), + last_allocated_sequence_(0), + last_published_sequence_(0), + prev_log_number_(0), + current_version_number_(0), + manifest_file_size_(0), + file_options_(storage_options), + block_cache_tracer_(block_cache_tracer) {} + +VersionSet::~VersionSet() { + // we need to delete column_family_set_ because its destructor depends on + // VersionSet + Cache* table_cache = column_family_set_->get_table_cache(); + column_family_set_.reset(); + for (auto& file : obsolete_files_) { + if (file.metadata->table_reader_handle) { + table_cache->Release(file.metadata->table_reader_handle); + TableCache::Evict(table_cache, file.metadata->fd.GetNumber()); + } + file.DeleteMetadata(); + } + obsolete_files_.clear(); +} + +void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, + Version* v) { + // compute new compaction score + v->storage_info()->ComputeCompactionScore( + *column_family_data->ioptions(), + *column_family_data->GetLatestMutableCFOptions()); + + // Mark v finalized + v->storage_info_.SetFinalized(); + + // Make "v" current + assert(v->refs_ == 0); + Version* current = column_family_data->current(); + assert(v != current); + if (current != nullptr) { + assert(current->refs_ > 0); + current->Unref(); + } + column_family_data->SetCurrent(v); + v->Ref(); + + // Append to linked list + v->prev_ = column_family_data->dummy_versions()->prev_; + v->next_ = column_family_data->dummy_versions(); + v->prev_->next_ = v; + v->next_->prev_ = v; +} + +Status VersionSet::ProcessManifestWrites( + std::deque& writers, InstrumentedMutex* mu, + Directory* db_directory, bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options) { + assert(!writers.empty()); + ManifestWriter& first_writer = writers.front(); + ManifestWriter* last_writer = &first_writer; + + assert(!manifest_writers_.empty()); + assert(manifest_writers_.front() == &first_writer); + + autovector batch_edits; + autovector versions; + autovector mutable_cf_options_ptrs; + std::vector> builder_guards; + + if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) { + // No group commits for column family add or drop + LogAndApplyCFHelper(first_writer.edit_list.front()); + batch_edits.push_back(first_writer.edit_list.front()); + } else { + auto it = manifest_writers_.cbegin(); + size_t group_start = std::numeric_limits::max(); 
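+    // group_start tracks the offset in batch_edits at which the current
+    // atomic group begins; max() is the "not inside a group" sentinel.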
+ while (it != manifest_writers_.cend()) { + if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) { + // no group commits for column family add or drop + break; + } + last_writer = *(it++); + assert(last_writer != nullptr); + assert(last_writer->cfd != nullptr); + if (last_writer->cfd->IsDropped()) { + // If we detect a dropped CF at this point, and the corresponding + // version edits belong to an atomic group, then we need to find out + // the preceding version edits in the same atomic group, and update + // their `remaining_entries_` member variable because we are NOT going + // to write the version edits' of dropped CF to the MANIFEST. If we + // don't update, then Recover can report corrupted atomic group because + // the `remaining_entries_` do not match. + if (!batch_edits.empty()) { + if (batch_edits.back()->is_in_atomic_group_ && + batch_edits.back()->remaining_entries_ > 0) { + assert(group_start < batch_edits.size()); + const auto& edit_list = last_writer->edit_list; + size_t k = 0; + while (k < edit_list.size()) { + if (!edit_list[k]->is_in_atomic_group_) { + break; + } else if (edit_list[k]->remaining_entries_ == 0) { + ++k; + break; + } + ++k; + } + for (auto i = group_start; i < batch_edits.size(); ++i) { + assert(static_cast(k) <= + batch_edits.back()->remaining_entries_); + batch_edits[i]->remaining_entries_ -= static_cast(k); + } + } + } + continue; + } + // We do a linear search on versions because versions is small. + // TODO(yanqin) maybe consider unordered_map + Version* version = nullptr; + VersionBuilder* builder = nullptr; + for (int i = 0; i != static_cast(versions.size()); ++i) { + uint32_t cf_id = last_writer->cfd->GetID(); + if (versions[i]->cfd()->GetID() == cf_id) { + version = versions[i]; + assert(!builder_guards.empty() && + builder_guards.size() == versions.size()); + builder = builder_guards[i]->version_builder(); + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id); + break; + } + } + if (version == nullptr) { + version = new Version(last_writer->cfd, this, file_options_, + last_writer->mutable_cf_options, + current_version_number_++); + versions.push_back(version); + mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); + builder_guards.emplace_back( + new BaseReferencedVersionBuilder(last_writer->cfd)); + builder = builder_guards.back()->version_builder(); + } + assert(builder != nullptr); // make checker happy + for (const auto& e : last_writer->edit_list) { + if (e->is_in_atomic_group_) { + if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || + (batch_edits.back()->is_in_atomic_group_ && + batch_edits.back()->remaining_entries_ == 0)) { + group_start = batch_edits.size(); + } + } else if (group_start != std::numeric_limits::max()) { + group_start = std::numeric_limits::max(); + } + Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu); + if (!s.ok()) { + // free up the allocated memory + for (auto v : versions) { + delete v; + } + return s; + } + batch_edits.push_back(e); + } + } + for (int i = 0; i < static_cast(versions.size()); ++i) { + assert(!builder_guards.empty() && + builder_guards.size() == versions.size()); + auto* builder = builder_guards[i]->version_builder(); + Status s = builder->SaveTo(versions[i]->storage_info()); + if (!s.ok()) { + // free up the allocated memory + for (auto v : versions) { + delete v; + } + return s; + } + } + } + +#ifndef NDEBUG + // Verify that version edits of atomic groups have correct + // remaining_entries_. 
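+  // For example, a well-formed three-edit group stored at offsets k, k+1,
+  // k+2 of batch_edits carries remaining_entries_ of 2, 1, 0, so
+  // i - k + remaining_entries_[i] is constant across the group, which is
+  // what the assert below checks.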
+ size_t k = 0; + while (k < batch_edits.size()) { + while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) { + ++k; + } + if (k == batch_edits.size()) { + break; + } + size_t i = k; + while (i < batch_edits.size()) { + if (!batch_edits[i]->is_in_atomic_group_) { + break; + } + assert(i - k + batch_edits[i]->remaining_entries_ == + batch_edits[k]->remaining_entries_); + if (batch_edits[i]->remaining_entries_ == 0) { + ++i; + break; + } + ++i; + } + assert(batch_edits[i - 1]->is_in_atomic_group_); + assert(0 == batch_edits[i - 1]->remaining_entries_); + std::vector tmp; + for (size_t j = k; j != i; ++j) { + tmp.emplace_back(batch_edits[j]); + } + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp); + k = i; + } +#endif // NDEBUG + + uint64_t new_manifest_file_size = 0; + Status s; + + assert(pending_manifest_file_number_ == 0); + if (!descriptor_log_ || + manifest_file_size_ > db_options_->max_manifest_file_size) { + TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest"); + new_descriptor_log = true; + } else { + pending_manifest_file_number_ = manifest_file_number_; + } + + // Local cached copy of state variable(s). WriteCurrentStateToManifest() + // reads its content after releasing db mutex to avoid race with + // SwitchMemtable(). + std::unordered_map curr_state; + if (new_descriptor_log) { + pending_manifest_file_number_ = NewFileNumber(); + batch_edits.back()->SetNextFile(next_file_number_.load()); + + // if we are writing out new snapshot make sure to persist max column + // family. + if (column_family_set_->GetMaxColumnFamily() > 0) { + first_writer.edit_list.front()->SetMaxColumnFamily( + column_family_set_->GetMaxColumnFamily()); + } + for (const auto* cfd : *column_family_set_) { + assert(curr_state.find(cfd->GetID()) == curr_state.end()); + curr_state[cfd->GetID()] = {cfd->GetLogNumber()}; + } + } + + { + FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); + mu->Unlock(); + + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); + if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { + for (int i = 0; i < static_cast(versions.size()); ++i) { + assert(!builder_guards.empty() && + builder_guards.size() == versions.size()); + assert(!mutable_cf_options_ptrs.empty() && + builder_guards.size() == versions.size()); + ColumnFamilyData* cfd = versions[i]->cfd_; + s = builder_guards[i]->version_builder()->LoadTableHandlers( + cfd->internal_stats(), cfd->ioptions()->optimize_filters_for_hits, + true /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + mutable_cf_options_ptrs[i]->prefix_extractor.get()); + if (!s.ok()) { + if (db_options_->paranoid_checks) { + break; + } + s = Status::OK(); + } + } + } + + if (s.ok() && new_descriptor_log) { + // This is fine because everything inside of this block is serialized -- + // only one thread can be here at the same time + // create new manifest file + ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n", + pending_manifest_file_number_); + std::string descriptor_fname = + DescriptorFileName(dbname_, pending_manifest_file_number_); + std::unique_ptr descriptor_file; + s = NewWritableFile(fs_, descriptor_fname, &descriptor_file, + opt_file_opts); + if (s.ok()) { + descriptor_file->SetPreallocationBlockSize( + db_options_->manifest_preallocation_size); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(descriptor_file), descriptor_fname, opt_file_opts, env_, + nullptr, 
db_options_->listeners)); + descriptor_log_.reset( + new log::Writer(std::move(file_writer), 0, false)); + s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get()); + } + } + + if (s.ok()) { + if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { + for (int i = 0; i < static_cast(versions.size()); ++i) { + versions[i]->PrepareApply(*mutable_cf_options_ptrs[i], true); + } + } + + // Write new records to MANIFEST log +#ifndef NDEBUG + size_t idx = 0; +#endif + for (auto& e : batch_edits) { + std::string record; + if (!e->EncodeTo(&record)) { + s = Status::Corruption("Unable to encode VersionEdit:" + + e->DebugString(true)); + break; + } + TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", + rocksdb_kill_odds * REDUCE_ODDS2); +#ifndef NDEBUG + if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + nullptr); + TEST_SYNC_POINT( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"); + } + ++idx; +#endif /* !NDEBUG */ + s = descriptor_log_->AddRecord(record); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + s = SyncManifest(env_, db_options_, descriptor_log_->file()); + } + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n", + s.ToString().c_str()); + } + } + + // If we just created a new descriptor file, install it by writing a + // new CURRENT file that points to it. + if (s.ok() && new_descriptor_log) { + s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, + db_directory); + TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest"); + } + + if (s.ok()) { + // find offset in manifest file where this version is stored. + new_manifest_file_size = descriptor_log_->file()->GetFileSize(); + } + + if (first_writer.edit_list.front()->is_column_family_drop_) { + TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0"); + TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1"); + TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2"); + } + + LogFlush(db_options_->info_log); + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone"); + mu->Lock(); + } + + // Append the old manifest file to the obsolete_manifest_ list to be deleted + // by PurgeObsoleteFiles later. + if (s.ok() && new_descriptor_log) { + obsolete_manifests_.emplace_back( + DescriptorFileName("", manifest_file_number_)); + } + + // Install the new versions + if (s.ok()) { + if (first_writer.edit_list.front()->is_column_family_add_) { + assert(batch_edits.size() == 1); + assert(new_cf_options != nullptr); + CreateColumnFamily(*new_cf_options, first_writer.edit_list.front()); + } else if (first_writer.edit_list.front()->is_column_family_drop_) { + assert(batch_edits.size() == 1); + first_writer.cfd->SetDropped(); + first_writer.cfd->UnrefAndTryDelete(); + } else { + // Each version in versions corresponds to a column family. + // For each column family, update its log number indicating that logs + // with number smaller than this should be ignored. 
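+      // For example, if the batch carries edits with log numbers 8 and 12
+      // for the same column family, its log number advances to 12 and WALs
+      // numbered below 12 can be skipped for that family during recovery.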
+ for (const auto version : versions) { + uint64_t max_log_number_in_batch = 0; + uint32_t cf_id = version->cfd_->GetID(); + for (const auto& e : batch_edits) { + if (e->has_log_number_ && e->column_family_ == cf_id) { + max_log_number_in_batch = + std::max(max_log_number_in_batch, e->log_number_); + } + } + if (max_log_number_in_batch != 0) { + assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch); + version->cfd_->SetLogNumber(max_log_number_in_batch); + } + } + + uint64_t last_min_log_number_to_keep = 0; + for (auto& e : batch_edits) { + if (e->has_min_log_number_to_keep_) { + last_min_log_number_to_keep = + std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_); + } + } + + if (last_min_log_number_to_keep != 0) { + // Should only be set in 2PC mode. + MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep); + } + + for (int i = 0; i < static_cast(versions.size()); ++i) { + ColumnFamilyData* cfd = versions[i]->cfd_; + AppendVersion(cfd, versions[i]); + } + } + manifest_file_number_ = pending_manifest_file_number_; + manifest_file_size_ = new_manifest_file_size; + prev_log_number_ = first_writer.edit_list.front()->prev_log_number_; + } else { + std::string version_edits; + for (auto& e : batch_edits) { + version_edits += ("\n" + e->DebugString(true)); + } + ROCKS_LOG_ERROR(db_options_->info_log, + "Error in committing version edit to MANIFEST: %s", + version_edits.c_str()); + for (auto v : versions) { + delete v; + } + // If manifest append failed for whatever reason, the file could be + // corrupted. So we need to force the next version update to start a + // new manifest file. + descriptor_log_.reset(); + if (new_descriptor_log) { + ROCKS_LOG_INFO(db_options_->info_log, + "Deleting manifest %" PRIu64 " current manifest %" PRIu64 + "\n", + manifest_file_number_, pending_manifest_file_number_); + env_->DeleteFile( + DescriptorFileName(dbname_, pending_manifest_file_number_)); + } + } + + pending_manifest_file_number_ = 0; + + // wake up all the waiting writers + while (true) { + ManifestWriter* ready = manifest_writers_.front(); + manifest_writers_.pop_front(); + bool need_signal = true; + for (const auto& w : writers) { + if (&w == ready) { + need_signal = false; + break; + } + } + ready->status = s; + ready->done = true; + if (need_signal) { + ready->cv.Signal(); + } + if (ready == last_writer) { + break; + } + } + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } + return s; +} + +// 'datas' is gramatically incorrect. We still use this notation to indicate +// that this variable represents a collection of column_family_data. +Status VersionSet::LogAndApply( + const autovector& column_family_datas, + const autovector& mutable_cf_options_list, + const autovector>& edit_lists, + InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options) { + mu->AssertHeld(); + int num_edits = 0; + for (const auto& elist : edit_lists) { + num_edits += static_cast(elist.size()); + } + if (num_edits == 0) { + return Status::OK(); + } else if (num_edits > 1) { +#ifndef NDEBUG + for (const auto& edit_list : edit_lists) { + for (const auto& edit : edit_list) { + assert(!edit->IsColumnFamilyManipulation()); + } + } +#endif /* ! 
NDEBUG */
+  }
+
+  int num_cfds = static_cast<int>(column_family_datas.size());
+  if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+    assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+    assert(edit_lists[0][0]->is_column_family_add_);
+    assert(new_cf_options != nullptr);
+  }
+  std::deque<ManifestWriter> writers;
+  if (num_cfds > 0) {
+    assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+    assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+  }
+  for (int i = 0; i < num_cfds; ++i) {
+    writers.emplace_back(mu, column_family_datas[i],
+                         *mutable_cf_options_list[i], edit_lists[i]);
+    manifest_writers_.push_back(&writers[i]);
+  }
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+    first_writer.cv.Wait();
+  }
+  if (first_writer.done) {
+    // All non-CF-manipulation operations can be grouped together and
+    // committed to MANIFEST. They should all have finished. The status code
+    // is stored in the first manifest writer.
+#ifndef NDEBUG
+    for (const auto& writer : writers) {
+      assert(writer.done);
+    }
+#endif /* !NDEBUG */
+    return first_writer.status;
+  }
+
+  int num_undropped_cfds = 0;
+  for (auto cfd : column_family_datas) {
+    // if cfd == nullptr, it is a column family add.
+    if (cfd == nullptr || !cfd->IsDropped()) {
+      ++num_undropped_cfds;
+    }
+  }
+  if (0 == num_undropped_cfds) {
+    for (int i = 0; i != num_cfds; ++i) {
+      manifest_writers_.pop_front();
+    }
+    // Notify new head of manifest write queue.
+    if (!manifest_writers_.empty()) {
+      manifest_writers_.front()->cv.Signal();
+    }
+    return Status::ColumnFamilyDropped();
+  }
+
+  return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log,
+                               new_cf_options);
+}
+
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
+  assert(edit->IsColumnFamilyManipulation());
+  edit->SetNextFile(next_file_number_.load());
+  // The log might have data that is not visible to the memtable and hence
+  // has not updated the last_sequence_ yet. It is also possible that the
+  // log is expecting some new data that is not written yet. Since
+  // LastSequence is an upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+  edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
+                                                      : last_sequence_);
+  if (edit->is_column_family_drop_) {
+    // if we drop a column family, we have to make sure to save the max
+    // column family, so that we don't reuse an existing ID
+    edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+  }
+}
+
+Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+                                     VersionBuilder* builder, VersionEdit* edit,
+                                     InstrumentedMutex* mu) {
+#ifdef NDEBUG
+  (void)cfd;
+#endif
+  mu->AssertHeld();
+  assert(!edit->IsColumnFamilyManipulation());
+
+  if (edit->has_log_number_) {
+    assert(edit->log_number_ >= cfd->GetLogNumber());
+    assert(edit->log_number_ < next_file_number_.load());
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+  edit->SetNextFile(next_file_number_.load());
+  // The log might have data that is not visible to the memtable and hence
+  // has not updated the last_sequence_ yet. It is also possible that the
+  // log is expecting some new data that is not written yet. Since
+  // LastSequence is an upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+  edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
last_allocated_sequence_ + : last_sequence_); + + Status s = builder->Apply(edit); + + return s; +} + +Status VersionSet::ApplyOneVersionEditToBuilder( + VersionEdit& edit, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map>& + builders, + VersionEditParams* version_edit_params) { + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. + bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) != + column_families_not_found.end()); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool cf_in_builders = builders.find(edit.column_family_) != builders.end(); + + // they can't both be true + assert(!(cf_in_not_found && cf_in_builders)); + + ColumnFamilyData* cfd = nullptr; + + if (edit.is_column_family_add_) { + if (cf_in_builders || cf_in_not_found) { + return Status::Corruption( + "Manifest adding the same column family twice: " + + edit.column_family_name_); + } + auto cf_options = name_to_options.find(edit.column_family_name_); + // implicitly add persistent_stats column family without requiring user + // to specify + bool is_persistent_stats_column_family = + edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + if (cf_options == name_to_options.end() && + !is_persistent_stats_column_family) { + column_families_not_found.insert( + {edit.column_family_, edit.column_family_name_}); + } else { + // recover persistent_stats CF from a DB that already contains it + if (is_persistent_stats_column_family) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + cfd = CreateColumnFamily(cfo, &edit); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + } + cfd->set_initialized(); + builders.insert(std::make_pair( + edit.column_family_, std::unique_ptr( + new BaseReferencedVersionBuilder(cfd)))); + } + } else if (edit.is_column_family_drop_) { + if (cf_in_builders) { + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + builders.erase(builder); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + assert(cfd != nullptr); + if (cfd->UnrefAndTryDelete()) { + cfd = nullptr; + } else { + // who else can have reference to cfd!? 
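+        // Editorial note: recovery is single-threaded and the builder erased
+        // above held the only extra reference, so UnrefAndTryDelete() is
+        // expected to destroy the CFD here; a surviving reference would
+        // indicate a reference-count bug, hence the assert below.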
+ assert(false); + } + } else if (cf_in_not_found) { + column_families_not_found.erase(edit.column_family_); + } else { + return Status::Corruption( + "Manifest - dropping non-existing column family"); + } + } else if (!cf_in_not_found) { + if (!cf_in_builders) { + return Status::Corruption( + "Manifest record referencing unknown column family"); + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + + // if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + Status s = builder->second->version_builder()->Apply(&edit); + if (!s.ok()) { + return s; + } + } + return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params); +} + +Status VersionSet::ExtractInfoFromVersionEdit( + ColumnFamilyData* cfd, const VersionEdit& from_edit, + VersionEditParams* version_edit_params) { + if (cfd != nullptr) { + if (from_edit.has_db_id_) { + version_edit_params->SetDBId(from_edit.db_id_); + } + if (from_edit.has_log_number_) { + if (cfd->GetLogNumber() > from_edit.log_number_) { + ROCKS_LOG_WARN( + db_options_->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(from_edit.log_number_); + version_edit_params->SetLogNumber(from_edit.log_number_); + } + } + if (from_edit.has_comparator_ && + from_edit.comparator_ != cfd->user_comparator()->Name()) { + return Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + from_edit.comparator_); + } + } + + if (from_edit.has_prev_log_number_) { + version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_); + } + + if (from_edit.has_next_file_number_) { + version_edit_params->SetNextFile(from_edit.next_file_number_); + } + + if (from_edit.has_max_column_family_) { + version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_); + } + + if (from_edit.has_min_log_number_to_keep_) { + version_edit_params->min_log_number_to_keep_ = + std::max(version_edit_params->min_log_number_to_keep_, + from_edit.min_log_number_to_keep_); + } + + if (from_edit.has_last_sequence_) { + version_edit_params->SetLastSequence(from_edit.last_sequence_); + } + return Status::OK(); +} + +Status VersionSet::GetCurrentManifestPath(const std::string& dbname, + FileSystem* fs, + std::string* manifest_path, + uint64_t* manifest_file_number) { + assert(fs != nullptr); + assert(manifest_path != nullptr); + assert(manifest_file_number != nullptr); + + std::string fname; + Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname); + if (!s.ok()) { + return s; + } + if (fname.empty() || fname.back() != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + // remove the trailing '\n' + fname.resize(fname.size() - 1); + FileType type; + bool parse_ok = ParseFileName(fname, manifest_file_number, &type); + if (!parse_ok || type != kDescriptorFile) { + return Status::Corruption("CURRENT file corrupted"); + } + *manifest_path = dbname; + if (dbname.back() != '/') { + manifest_path->push_back('/'); + } + *manifest_path += fname; + return Status::OK(); +} + +Status VersionSet::ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + 
std::unordered_map>& + builders, + VersionEditParams* version_edit_params, std::string* db_id) { + assert(reader != nullptr); + assert(read_buffer != nullptr); + Status s; + Slice record; + std::string scratch; + size_t recovered_edits = 0; + while (reader->ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + if (edit.has_db_id_) { + db_id_ = edit.GetDbId(); + if (db_id != nullptr) { + db_id->assign(edit.GetDbId()); + } + } + s = read_buffer->AddEdit(&edit); + if (!s.ok()) { + break; + } + if (edit.is_in_atomic_group_) { + if (read_buffer->IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer->replay_buffer()) { + s = ApplyOneVersionEditToBuilder(e, name_to_options, + column_families_not_found, builders, + version_edit_params); + if (!s.ok()) { + break; + } + recovered_edits++; + } + if (!s.ok()) { + break; + } + read_buffer->Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder(edit, name_to_options, + column_families_not_found, builders, + version_edit_params); + if (s.ok()) { + recovered_edits++; + } + } + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. + read_buffer->Clear(); + } + TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", + &recovered_edits); + return s; +} + +Status VersionSet::Recover( + const std::vector& column_families, bool read_only, + std::string* db_id) { + std::unordered_map cf_name_to_options; + for (const auto& cf : column_families) { + cf_name_to_options.emplace(cf.name, cf.options); + } + // keeps track of column families in manifest that were not found in + // column families parameters. if those column families are not dropped + // by subsequent manifest records, Recover() will return failure status + std::unordered_map column_families_not_found; + + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string manifest_path; + Status s = GetCurrentManifestPath(dbname_, fs_, &manifest_path, + &manifest_file_number_); + if (!s.ok()) { + return s; + } + + ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n", + manifest_path.c_str()); + + std::unique_ptr manifest_file_reader; + { + std::unique_ptr manifest_file; + s = fs_->NewSequentialFile(manifest_path, + fs_->OptimizeForManifestRead(file_options_), + &manifest_file, nullptr); + if (!s.ok()) { + return s; + } + manifest_file_reader.reset( + new SequentialFileReader(std::move(manifest_file), manifest_path, + db_options_->log_readahead_size)); + } + + std::unordered_map> + builders; + + // add default column family + auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); + if (default_cf_iter == cf_name_to_options.end()) { + return Status::InvalidArgument("Default column family not specified"); + } + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(default_cf_iter->second, &default_cf_edit); + // In recovery, nobody else can access it, so it's fine to set it to be + // initialized earlier. 
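+  // Recovery sketch: each entry in `builders` maps a column family ID to a
+  // VersionBuilder that accumulates the file additions/deletions replayed
+  // from the MANIFEST; only after all edits have been applied is a Version
+  // materialized per column family via SaveTo() further below.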
+ default_cfd->set_initialized(); + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); + uint64_t current_manifest_file_size = 0; + VersionEditParams version_edit_params; + { + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + Slice record; + std::string scratch; + AtomicGroupReadBuffer read_buffer; + s = ReadAndRecover(&reader, &read_buffer, cf_name_to_options, + column_families_not_found, builders, + &version_edit_params, db_id); + current_manifest_file_size = reader.GetReadOffset(); + assert(current_manifest_file_size != 0); + } + + if (s.ok()) { + if (!version_edit_params.has_next_file_number_) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + } else if (!version_edit_params.has_log_number_) { + s = Status::Corruption("no meta-lognumber entry in descriptor"); + } else if (!version_edit_params.has_last_sequence_) { + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!version_edit_params.has_prev_log_number_) { + version_edit_params.SetPrevLogNumber(0); + } + + column_family_set_->UpdateMaxColumnFamily( + version_edit_params.max_column_family_); + + // When reading DB generated using old release, min_log_number_to_keep=0. + // All log files will be scanned for potential prepare entries. + MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_); + MarkFileNumberUsed(version_edit_params.prev_log_number_); + MarkFileNumberUsed(version_edit_params.log_number_); + } + + // there were some column families in the MANIFEST that weren't specified + // in the argument. This is OK in read_only mode + if (read_only == false && !column_families_not_found.empty()) { + std::string list_of_not_found; + for (const auto& cf : column_families_not_found) { + list_of_not_found += ", " + cf.second; + } + list_of_not_found = list_of_not_found.substr(2); + s = Status::InvalidArgument( + "You have to open all column families. Column families not opened: " + + list_of_not_found); + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + assert(builders.count(cfd->GetID()) > 0); + auto* builder = builders[cfd->GetID()]->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + if (read_only) { + cfd->table_cache()->SetTablesAreImmortal(); + } + assert(cfd->initialized()); + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto builder = builders_iter->second->version_builder(); + + // unlimited table cache. Pre-load table handle now. + // Need to do it out of the mutex. 
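+      // LoadTableHandlers() opens every SST file referenced by the recovered
+      // version so that missing or corrupt files surface now rather than on
+      // first read; with paranoid_checks the failure below aborts recovery,
+      // otherwise it is swallowed and the file is re-opened lazily later.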
+ s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + true /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + if (!s.ok()) { + if (db_options_->paranoid_checks) { + return s; + } + s = Status::OK(); + } + + Version* v = new Version(cfd, this, file_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(v->storage_info()); + + // Install recovered version + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), + !(db_options_->skip_stats_update_on_db_open)); + AppendVersion(cfd, v); + } + + manifest_file_size_ = current_manifest_file_size; + next_file_number_.store(version_edit_params.next_file_number_ + 1); + last_allocated_sequence_ = version_edit_params.last_sequence_; + last_published_sequence_ = version_edit_params.last_sequence_; + last_sequence_ = version_edit_params.last_sequence_; + prev_log_number_ = version_edit_params.prev_log_number_; + + ROCKS_LOG_INFO( + db_options_->info_log, + "Recovered from manifest file:%s succeeded," + "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64 + ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 + ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 + ",min_log_number_to_keep is %" PRIu64 "\n", + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), version_edit_params.log_number_, + prev_log_number_, column_family_set_->GetMaxColumnFamily(), + min_log_number_to_keep_2pc()); + + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + ROCKS_LOG_INFO(db_options_->info_log, + "Column family [%s] (ID %" PRIu32 + "), log number is %" PRIu64 "\n", + cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); + } + } + + return s; +} + +Status VersionSet::ListColumnFamilies(std::vector* column_families, + const std::string& dbname, + FileSystem* fs) { + // these are just for performance reasons, not correcntes, + // so we're fine using the defaults + FileOptions soptions; + // Read "CURRENT" file, which contains a pointer to the current manifest file + std::string manifest_path; + uint64_t manifest_file_number; + Status s = + GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number); + if (!s.ok()) { + return s; + } + + std::unique_ptr file_reader; + { + std::unique_ptr file; + s = fs->NewSequentialFile(manifest_path, soptions, &file, nullptr); + if (!s.ok()) { + return s; + } + file_reader.reset(new SequentialFileReader(std::move(file), manifest_path)); + } + + std::map column_family_names; + // default column family is always implicitly there + column_family_names.insert({0, kDefaultColumnFamilyName}); + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + if (edit.is_column_family_add_) { + if (column_family_names.find(edit.column_family_) != + column_family_names.end()) { + s = Status::Corruption("Manifest adding the same column family twice"); + break; + } + column_family_names.insert( + {edit.column_family_, edit.column_family_name_}); + } else if (edit.is_column_family_drop_) { + if (column_family_names.find(edit.column_family_) == + column_family_names.end()) { + s = 
Status::Corruption( + "Manifest - dropping non-existing column family"); + break; + } + column_family_names.erase(edit.column_family_); + } + } + + column_families->clear(); + if (s.ok()) { + for (const auto& iter : column_family_names) { + column_families->push_back(iter.second); + } + } + + return s; +} + +#ifndef ROCKSDB_LITE +Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const FileOptions& file_options, + int new_levels) { + if (new_levels <= 1) { + return Status::InvalidArgument( + "Number of levels needs to be bigger than 1"); + } + + ImmutableDBOptions db_options(*options); + ColumnFamilyOptions cf_options(*options); + std::shared_ptr tc(NewLRUCache(options->max_open_files - 10, + options->table_cache_numshardbits)); + WriteController wc(options->delayed_write_rate); + WriteBufferManager wb(options->db_write_buffer_size); + VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); + Status status; + + std::vector dummy; + ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, + ColumnFamilyOptions(*options)); + dummy.push_back(dummy_descriptor); + status = versions.Recover(dummy); + if (!status.ok()) { + return status; + } + + Version* current_version = + versions.GetColumnFamilySet()->GetDefault()->current(); + auto* vstorage = current_version->storage_info(); + int current_levels = vstorage->num_levels(); + + if (current_levels <= new_levels) { + return Status::OK(); + } + + // Make sure there are file only on one level from + // (new_levels-1) to (current_levels-1) + int first_nonempty_level = -1; + int first_nonempty_level_filenum = 0; + for (int i = new_levels - 1; i < current_levels; i++) { + int file_num = vstorage->NumLevelFiles(i); + if (file_num != 0) { + if (first_nonempty_level < 0) { + first_nonempty_level = i; + first_nonempty_level_filenum = file_num; + } else { + char msg[255]; + snprintf(msg, sizeof(msg), + "Found at least two levels containing files: " + "[%d:%d],[%d:%d].\n", + first_nonempty_level, first_nonempty_level_filenum, i, + file_num); + return Status::InvalidArgument(msg); + } + } + } + + // we need to allocate an array with the old number of levels size to + // avoid SIGSEGV in WriteCurrentStatetoManifest() + // however, all levels bigger or equal to new_levels will be empty + std::vector* new_files_list = + new std::vector[current_levels]; + for (int i = 0; i < new_levels - 1; i++) { + new_files_list[i] = vstorage->LevelFiles(i); + } + + if (first_nonempty_level > 0) { + new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level); + } + + delete[] vstorage -> files_; + vstorage->files_ = new_files_list; + vstorage->num_levels_ = new_levels; + + MutableCFOptions mutable_cf_options(*options); + VersionEdit ve; + InstrumentedMutex dummy_mutex; + InstrumentedMutexLock l(&dummy_mutex); + return versions.LogAndApply( + versions.GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &ve, &dummy_mutex, nullptr, true); +} + +// Get the checksum information including the checksum and checksum function +// name of all SST files in VersionSet. Store the information in +// FileChecksumList which contains a map from file number to its checksum info. +// If DB is not running, make sure call VersionSet::Recover() to load the file +// metadata from Manifest to VersionSet before calling this function. +Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { + // Clean the previously stored checksum information if any. 
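+  // Illustrative call pattern (a hedged sketch, not part of this change):
+  //   std::unique_ptr<FileChecksumList> list(NewFileChecksumList());
+  //   Status s = version_set->GetLiveFilesChecksumInfo(list.get());
+  //   // `list` now maps each live SST file number to its checksum and
+  //   // checksum function name.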
+ if (checksum_list == nullptr) { + return Status::InvalidArgument("checksum_list is nullptr"); + } + checksum_list->reset(); + + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped() || !cfd->initialized()) { + continue; + } + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (const auto& file : + cfd->current()->storage_info()->LevelFiles(level)) { + checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), + file->file_checksum, + file->file_checksum_func_name); + } + } + } + return Status::OK(); +} + +Status VersionSet::DumpManifest(Options& options, std::string& dscname, + bool verbose, bool hex, bool json) { + // Open the specified manifest file. + std::unique_ptr file_reader; + Status s; + { + std::unique_ptr file; + s = options.file_system->NewSequentialFile( + dscname, + options.file_system->OptimizeForManifestRead(file_options_), &file, + nullptr); + if (!s.ok()) { + return s; + } + file_reader.reset(new SequentialFileReader( + std::move(file), dscname, db_options_->log_readahead_size)); + } + + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t previous_log_number = 0; + int count = 0; + std::unordered_map comparators; + std::unordered_map> + builders; + + // add default column family + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); + + { + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + // Write out each individual edit + if (verbose && !json) { + printf("%s\n", edit.DebugString(hex).c_str()); + } else if (json) { + printf("%s\n", edit.DebugJSON(count, hex).c_str()); + } + count++; + + bool cf_in_builders = + builders.find(edit.column_family_) != builders.end(); + + if (edit.has_comparator_) { + comparators.insert({edit.column_family_, edit.comparator_}); + } + + ColumnFamilyData* cfd = nullptr; + + if (edit.is_column_family_add_) { + if (cf_in_builders) { + s = Status::Corruption( + "Manifest adding the same column family twice"); + break; + } + cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); + cfd->set_initialized(); + builders.insert(std::make_pair( + edit.column_family_, std::unique_ptr( + new BaseReferencedVersionBuilder(cfd)))); + } else if (edit.is_column_family_drop_) { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest - dropping non-existing column family"); + break; + } + auto builder_iter = builders.find(edit.column_family_); + builders.erase(builder_iter); + comparators.erase(edit.column_family_); + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + assert(cfd != nullptr); + cfd->UnrefAndTryDelete(); + cfd = nullptr; + } else { + if (!cf_in_builders) { + s = Status::Corruption( + "Manifest record referencing unknown column family"); + break; + } + + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // this should never happen since cf_in_builders is true + assert(cfd != nullptr); + + 
// if it is not column family add or column family drop, + // then it's a file add/delete, which should be forwarded + // to builder + auto builder = builders.find(edit.column_family_); + assert(builder != builders.end()); + s = builder->second->version_builder()->Apply(&edit); + if (!s.ok()) { + break; + } + } + + if (cfd != nullptr && edit.has_log_number_) { + cfd->SetLogNumber(edit.log_number_); + } + + + if (edit.has_prev_log_number_) { + previous_log_number = edit.prev_log_number_; + have_prev_log_number = true; + } + + if (edit.has_next_file_number_) { + next_file = edit.next_file_number_; + have_next_file = true; + } + + if (edit.has_last_sequence_) { + last_sequence = edit.last_sequence_; + have_last_sequence = true; + } + + if (edit.has_max_column_family_) { + column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); + } + + if (edit.has_min_log_number_to_keep_) { + MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_); + } + } + } + file_reader.reset(); + + if (s.ok()) { + if (!have_next_file) { + s = Status::Corruption("no meta-nextfile entry in descriptor"); + printf("no meta-nextfile entry in descriptor"); + } else if (!have_last_sequence) { + printf("no last-sequence-number entry in descriptor"); + s = Status::Corruption("no last-sequence-number entry in descriptor"); + } + + if (!have_prev_log_number) { + previous_log_number = 0; + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto builder = builders_iter->second->version_builder(); + + Version* v = new Version(cfd, this, file_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(v->storage_info()); + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false); + + printf("--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); + auto comparator = comparators.find(cfd->GetID()); + if (comparator != comparators.end()) { + printf("comparator: %s\n", comparator->second.c_str()); + } else { + printf("comparator: \n"); + } + printf("%s \n", v->DebugString(hex).c_str()); + delete v; + } + + next_file_number_.store(next_file + 1); + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; + last_sequence_ = last_sequence; + prev_log_number_ = previous_log_number; + + printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep " + "%" PRIu64 "\n", + next_file_number_.load(), last_sequence, previous_log_number, + column_family_set_->GetMaxColumnFamily(), + min_log_number_to_keep_2pc()); + } + + return s; +} +#endif // ROCKSDB_LITE + +void VersionSet::MarkFileNumberUsed(uint64_t number) { + // only called during recovery and repair which are single threaded, so this + // works because there can't be concurrent calls + if (next_file_number_.load(std::memory_order_relaxed) <= number) { + next_file_number_.store(number + 1, std::memory_order_relaxed); + } +} +// Called only either from ::LogAndApply which is protected by mutex or during +// recovery which is single-threaded. 
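+// Because there is never more than one concurrent writer, a relaxed atomic
+// load/store pair is sufficient for the monotonic max update below; no
+// ordering with other memory operations is required.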
+void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { + if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) { + min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed); + } +} + +Status VersionSet::WriteCurrentStateToManifest( + const std::unordered_map& curr_state, + log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? + + // WARNING: This method doesn't hold a mutex!! + + // This is done without DB mutex lock held, but only within single-threaded + // LogAndApply. Column family manipulations can only happen within LogAndApply + // (the same single thread), so we're safe to iterate. + + if (db_options_->write_dbid_to_manifest) { + VersionEdit edit_for_db_id; + assert(!db_id_.empty()); + edit_for_db_id.SetDBId(db_id_); + std::string db_id_record; + if (!edit_for_db_id.EncodeTo(&db_id_record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit_for_db_id.DebugString(true)); + } + Status add_record = log->AddRecord(db_id_record); + if (!add_record.ok()) { + return add_record; + } + } + + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + { + // Store column family info + VersionEdit edit; + if (cfd->GetID() != 0) { + // default column family is always there, + // no need to explicitly write it + edit.AddColumnFamily(cfd->GetName()); + edit.SetColumnFamily(cfd->GetID()); + } + edit.SetComparatorName( + cfd->internal_comparator().user_comparator()->Name()); + std::string record; + if (!edit.EncodeTo(&record)) { + return Status::Corruption( + "Unable to Encode VersionEdit:" + edit.DebugString(true)); + } + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; + } + } + + { + // Save files + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (const auto& f : + cfd->current()->storage_info()->LevelFiles(level)) { + edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time, + f->file_checksum, f->file_checksum_func_name); + } + } + const auto iter = curr_state.find(cfd->GetID()); + assert(iter != curr_state.end()); + uint64_t log_number = iter->second.log_number; + edit.SetLogNumber(log_number); + std::string record; + if (!edit.EncodeTo(&record)) { + return Status::Corruption( + "Unable to Encode VersionEdit:" + edit.DebugString(true)); + } + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; + } + } + } + return Status::OK(); +} + +// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this +// function is called repeatedly with consecutive pairs of slices. For example +// if the slice list is [a, b, c, d] this function is called with arguments +// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where +// we avoid doing binary search for the keys b and c twice and instead somehow +// maintain state of where they first appear in the files. 
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + Version* v, const Slice& start, + const Slice& end, int start_level, + int end_level, TableReaderCaller caller) { + const auto& icmp = v->cfd_->internal_comparator(); + + // pre-condition + assert(icmp.Compare(start, end) <= 0); + + uint64_t total_full_size = 0; + const auto* vstorage = v->storage_info(); + const int num_non_empty_levels = vstorage->num_non_empty_levels(); + end_level = (end_level == -1) ? num_non_empty_levels + : std::min(end_level, num_non_empty_levels); + + assert(start_level <= end_level); + + // Outline of the optimization that uses options.files_size_error_margin. + // When approximating the files total size that is used to store a keys range, + // we first sum up the sizes of the files that fully fall into the range. + // Then we sum up the sizes of all the files that may intersect with the range + // (this includes all files in L0 as well). Then, if total_intersecting_size + // is smaller than total_full_size * options.files_size_error_margin - we can + // infer that the intersecting files have a sufficiently negligible + // contribution to the total size, and we can approximate the storage required + // for the keys in range as just half of the intersecting_files_size. + // E.g., if the value of files_size_error_margin is 0.1, then the error of the + // approximation is limited to only ~10% of the total size of files that fully + // fall into the keys range. In such case, this helps to avoid a costly + // process of binary searching the intersecting files that is required only + // for a more precise calculation of the total size. + + autovector first_files; + autovector last_files; + + // scan all the levels + for (int level = start_level; level < end_level; ++level) { + const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); + if (files_brief.num_files == 0) { + // empty level, skip exploration + continue; + } + + if (level == 0) { + // level 0 files are not in sorted order, we need to iterate through + // the list to compute the total bytes that require scanning, + // so handle the case explicitly (similarly to first_files case) + for (size_t i = 0; i < files_brief.num_files; i++) { + first_files.push_back(&files_brief.files[i]); + } + continue; + } + + assert(level > 0); + assert(files_brief.num_files > 0); + + // identify the file position for start key + const int idx_start = + FindFileInRange(icmp, files_brief, start, 0, + static_cast(files_brief.num_files - 1)); + assert(static_cast(idx_start) < files_brief.num_files); + + // identify the file position for end key + int idx_end = idx_start; + if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { + idx_end = + FindFileInRange(icmp, files_brief, end, idx_start, + static_cast(files_brief.num_files - 1)); + } + assert(idx_end >= idx_start && + static_cast(idx_end) < files_brief.num_files); + + // scan all files from the starting index to the ending index + // (inferred from the sorted order) + + // first scan all the intermediate full files (excluding first and last) + for (int i = idx_start + 1; i < idx_end; ++i) { + uint64_t file_size = files_brief.files[i].fd.GetFileSize(); + // The entire file falls into the range, so we can just take its size. + assert(file_size == + ApproximateSize(v, files_brief.files[i], start, end, caller)); + total_full_size += file_size; + } + + // save the first and the last files (which may be the same file), so we + // can scan them later. 
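+    // Worked example of the margin shortcut evaluated after this loop
+    // (illustrative numbers): if the fully-contained files sum to 100 MB and
+    // the boundary files collected here sum to 8 MB, then with
+    // files_size_error_margin = 0.1 we have 8 MB < 100 MB * 0.1, so half of
+    // the boundary total (4 MB) is added instead of binary-searching inside
+    // those files, bounding the error well below 10% of the full total.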
+ first_files.push_back(&files_brief.files[idx_start]); + if (idx_start != idx_end) { + // we need to estimate size for both files, only if they are different + last_files.push_back(&files_brief.files[idx_end]); + } + } + + // The sum of all file sizes that intersect the [start, end] keys range. + uint64_t total_intersecting_size = 0; + for (const auto* file_ptr : first_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + for (const auto* file_ptr : last_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + + // Now scan all the first & last files at each level, and estimate their size. + // If the total_intersecting_size is less than X% of the total_full_size - we + // want to approximate the result in order to avoid the costly binary search + // inside ApproximateSize. We use half of file size as an approximation below. + + const double margin = options.files_size_error_margin; + if (margin > 0 && total_intersecting_size < + static_cast(total_full_size * margin)) { + total_full_size += total_intersecting_size / 2; + } else { + // Estimate for all the first files, at each level + for (const auto file_ptr : first_files) { + total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); + } + + // Estimate for all the last files, at each level + for (const auto file_ptr : last_files) { + // We could use ApproximateSize here, but calling ApproximateOffsetOf + // directly is just more efficient. + total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); + } + } + + return total_full_size; +} + +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + + uint64_t result = 0; + if (icmp.Compare(f.largest_key, key) <= 0) { + // Entire file is before "key", so just add the file size + result = f.fd.GetFileSize(); + } else if (icmp.Compare(f.smallest_key, key) > 0) { + // Entire file is after "key", so ignore + result = 0; + } else { + // "key" falls in the range for this table. Add the + // approximate offset of "key" within the table. + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache != nullptr) { + result = table_cache->ApproximateOffsetOf( + key, f.file_metadata->fd, caller, icmp, + v->GetMutableCFOptions().prefix_extractor.get()); + } + } + return result; +} + +uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, + const Slice& start, const Slice& end, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + assert(icmp.Compare(start, end) <= 0); + + if (icmp.Compare(f.largest_key, start) <= 0 || + icmp.Compare(f.smallest_key, end) > 0) { + // Entire file is before or after the start/end keys range + return 0; + } + + if (icmp.Compare(f.smallest_key, start) >= 0) { + // Start of the range is before the file start - approximate by end offset + return ApproximateOffsetOf(v, f, end, caller); + } + + if (icmp.Compare(f.largest_key, end) < 0) { + // End of the range is after the file end - approximate by subtracting + // start offset from the file size + uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + assert(f.fd.GetFileSize() >= start_offset); + return f.fd.GetFileSize() - start_offset; + } + + // The interval falls entirely in the range for this file. 
+ TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache == nullptr) { + return 0; + } + return table_cache->ApproximateSize( + start, end, f.file_metadata->fd, caller, icmp, + v->GetMutableCFOptions().prefix_extractor.get()); +} + +void VersionSet::AddLiveFiles(std::vector* live_list) { + // pre-calculate space requirement + int64_t total_files = 0; + for (auto cfd : *column_family_set_) { + if (!cfd->initialized()) { + continue; + } + Version* dummy_versions = cfd->dummy_versions(); + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + const auto* vstorage = v->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + total_files += vstorage->LevelFiles(level).size(); + } + } + } + + // just one time extension to the right size + live_list->reserve(live_list->size() + static_cast(total_files)); + + for (auto cfd : *column_family_set_) { + if (!cfd->initialized()) { + continue; + } + auto* current = cfd->current(); + bool found_current = false; + Version* dummy_versions = cfd->dummy_versions(); + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + v->AddLiveFiles(live_list); + if (v == current) { + found_current = true; + } + } + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. + assert(false); + current->AddLiveFiles(live_list); + } + } +} + +InternalIterator* VersionSet::MakeInputIterator( + const Compaction* c, RangeDelAggregator* range_del_agg, + const FileOptions& file_options_compactions) { + auto cfd = c->column_family_data(); + ReadOptions read_options; + read_options.verify_checksums = true; + read_options.fill_cache = false; + // Compaction iterators shouldn't be confined to a single prefix. + // Compactions use Seek() for + // (a) concurrent compactions, + // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. + read_options.total_order_seek = true; + + // Level-0 files have to be merged together. For other levels, + // we will make a concatenating iterator per level. + // TODO(opt): use concatenating iterator for level-0 if there is no overlap + const size_t space = (c->level() == 0 ? 
c->input_levels(0)->num_files + + c->num_input_levels() - 1 + : c->num_input_levels()); + InternalIterator** list = new InternalIterator* [space]; + size_t num = 0; + for (size_t which = 0; which < c->num_input_levels(); which++) { + if (c->input_levels(which)->num_files != 0) { + if (c->level(which) == 0) { + const LevelFilesBrief* flevel = c->input_levels(which); + for (size_t i = 0; i < flevel->num_files; i++) { + list[num++] = cfd->table_cache()->NewIterator( + read_options, file_options_compactions, + cfd->internal_comparator(), + *flevel->files[i].file_metadata, range_del_agg, + c->mutable_cf_options()->prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, + /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/static_cast(which), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); + } + } else { + // Create concatenating iterator for the files from this level + list[num++] = new LevelIterator( + cfd->table_cache(), read_options, file_options_compactions, + cfd->internal_comparator(), c->input_levels(which), + c->mutable_cf_options()->prefix_extractor.get(), + /*should_sample=*/false, + /*no per level latency histogram=*/nullptr, + TableReaderCaller::kCompaction, /*skip_filters=*/false, + /*level=*/static_cast(which), range_del_agg, + c->boundaries(which)); + } + } + } + assert(num <= space); + InternalIterator* result = + NewMergingIterator(&c->column_family_data()->internal_comparator(), list, + static_cast(num)); + delete[] list; + return result; +} + +// verify that the files listed in this compaction are present +// in the current version +bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { +#ifndef NDEBUG + Version* version = c->column_family_data()->current(); + const VersionStorageInfo* vstorage = version->storage_info(); + if (c->input_version() != version) { + ROCKS_LOG_INFO( + db_options_->info_log, + "[%s] compaction output being applied to a different base version from" + " input version", + c->column_family_data()->GetName().c_str()); + + if (vstorage->compaction_style_ == kCompactionStyleLevel && + c->start_level() == 0 && c->num_input_levels() > 2U) { + // We are doing a L0->base_level compaction. The assumption is if + // base level is not L1, levels from L1 to base_level - 1 is empty. + // This is ensured by having one compaction from L0 going on at the + // same time in level-based compaction. So that during the time, no + // compaction/flush can put files to those levels. 
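+    // Editorial example: with num_levels = 5 and an L0 -> L3 (base level)
+    // compaction, the loop below requires L1 and L2 to be empty; any file
+    // found there would contradict the single-L0-compaction assumption
+    // described above.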
+ for (int l = c->start_level() + 1; l < c->output_level(); l++) { + if (vstorage->NumLevelFiles(l) != 0) { + return false; + } + } + } + } + + for (size_t input = 0; input < c->num_input_levels(); ++input) { + int level = c->level(input); + for (size_t i = 0; i < c->num_input_files(input); ++i) { + uint64_t number = c->input(input, i)->fd.GetNumber(); + bool found = false; + for (size_t j = 0; j < vstorage->files_[level].size(); j++) { + FileMetaData* f = vstorage->files_[level][j]; + if (f->fd.GetNumber() == number) { + found = true; + break; + } + } + if (!found) { + return false; // input files non existent in current version + } + } + } +#else + (void)c; +#endif + return true; // everything good +} + +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** meta, + ColumnFamilyData** cfd) { + for (auto cfd_iter : *column_family_set_) { + if (!cfd_iter->initialized()) { + continue; + } + Version* version = cfd_iter->current(); + const auto* vstorage = version->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + for (const auto& file : vstorage->LevelFiles(level)) { + if (file->fd.GetNumber() == number) { + *meta = file; + *filelevel = level; + *cfd = cfd_iter; + return Status::OK(); + } + } + } + } + return Status::NotFound("File not present in any level"); +} + +void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped() || !cfd->initialized()) { + continue; + } + for (int level = 0; level < cfd->NumberLevels(); level++) { + for (const auto& file : + cfd->current()->storage_info()->LevelFiles(level)) { + LiveFileMetaData filemetadata; + filemetadata.column_family_name = cfd->GetName(); + uint32_t path_id = file->fd.GetPathId(); + if (path_id < cfd->ioptions()->cf_paths.size()) { + filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path; + } else { + assert(!cfd->ioptions()->cf_paths.empty()); + filemetadata.db_path = cfd->ioptions()->cf_paths.back().path; + } + const uint64_t file_number = file->fd.GetNumber(); + filemetadata.name = MakeTableFileName("", file_number); + filemetadata.file_number = file_number; + filemetadata.level = level; + filemetadata.size = static_cast(file->fd.GetFileSize()); + filemetadata.smallestkey = file->smallest.user_key().ToString(); + filemetadata.largestkey = file->largest.user_key().ToString(); + filemetadata.smallest_seqno = file->fd.smallest_seqno; + filemetadata.largest_seqno = file->fd.largest_seqno; + filemetadata.num_reads_sampled = file->stats.num_reads_sampled.load( + std::memory_order_relaxed); + filemetadata.being_compacted = file->being_compacted; + filemetadata.num_entries = file->num_entries; + filemetadata.num_deletions = file->num_deletions; + filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; + filemetadata.file_checksum = file->file_checksum; + filemetadata.file_checksum_func_name = file->file_checksum_func_name; + metadata->push_back(filemetadata); + } + } + } +} + +void VersionSet::GetObsoleteFiles(std::vector* files, + std::vector* manifest_filenames, + uint64_t min_pending_output) { + assert(manifest_filenames->empty()); + obsolete_manifests_.swap(*manifest_filenames); + std::vector pending_files; + for (auto& f : obsolete_files_) { + if (f.metadata->fd.GetNumber() < min_pending_output) { + files->push_back(std::move(f)); + } else { + pending_files.push_back(std::move(f)); + } + } + obsolete_files_.swap(pending_files); +} + +ColumnFamilyData* VersionSet::CreateColumnFamily( + 
const ColumnFamilyOptions& cf_options, VersionEdit* edit) { + assert(edit->is_column_family_add_); + + MutableCFOptions dummy_cf_options; + Version* dummy_versions = + new Version(nullptr, this, file_options_, dummy_cf_options); + // Ref() dummy version once so that later we can call Unref() to delete it + // by avoiding calling "delete" explicitly (~Version is private) + dummy_versions->Ref(); + auto new_cfd = column_family_set_->CreateColumnFamily( + edit->column_family_name_, edit->column_family_, dummy_versions, + cf_options); + + Version* v = new Version(new_cfd, this, file_options_, + *new_cfd->GetLatestMutableCFOptions(), + current_version_number_++); + + // Fill level target base information. + v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(), + *new_cfd->GetLatestMutableCFOptions()); + AppendVersion(new_cfd, v); + // GetLatestMutableCFOptions() is safe here without mutex since the + // cfd is not available to client + new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(), + LastSequence()); + new_cfd->SetLogNumber(edit->log_number_); + return new_cfd; +} + +uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) { + uint64_t count = 0; + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + count++; + } + return count; +} + +uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) { + std::unordered_set unique_files; + uint64_t total_files_size = 0; + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + VersionStorageInfo* storage_info = v->storage_info(); + for (int level = 0; level < storage_info->num_levels_; level++) { + for (const auto& file_meta : storage_info->LevelFiles(level)) { + if (unique_files.find(file_meta->fd.packed_number_and_path_id) == + unique_files.end()) { + unique_files.insert(file_meta->fd.packed_number_and_path_id); + total_files_size += file_meta->fd.GetFileSize(); + } + } + } + } + return total_files_size; +} + +ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const FileOptions& _file_options, + Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller) + : VersionSet(dbname, _db_options, _file_options, table_cache, + write_buffer_manager, write_controller, + /*block_cache_tracer=*/nullptr), + number_of_edits_to_skip_(0) {} + +ReactiveVersionSet::~ReactiveVersionSet() {} + +Status ReactiveVersionSet::Recover( + const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status) { + assert(manifest_reader != nullptr); + assert(manifest_reporter != nullptr); + assert(manifest_reader_status != nullptr); + + std::unordered_map cf_name_to_options; + for (const auto& cf : column_families) { + cf_name_to_options.insert({cf.name, cf.options}); + } + + // add default column family + auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); + if (default_cf_iter == cf_name_to_options.end()) { + return Status::InvalidArgument("Default column family not specified"); + } + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(default_cf_iter->second, &default_cf_edit); + // In recovery, nobody else can access it, so it's fine to set it to be + // initialized earlier. 
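+  // Unlike primary recovery above, the secondary tolerates PathNotFound from
+  // LoadTableHandlers() further down: the primary may have already deleted
+  // files that this secondary has not yet picked up, so that error is not
+  // treated as fatal here.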
+ default_cfd->set_initialized(); + std::unordered_map> + builders; + std::unordered_map column_families_not_found; + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); + + manifest_reader_status->reset(new Status()); + manifest_reporter->reset(new LogReporter()); + static_cast(manifest_reporter->get())->status = + manifest_reader_status->get(); + Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); + log::Reader* reader = manifest_reader->get(); + + int retry = 0; + VersionEdit version_edit; + while (s.ok() && retry < 1) { + assert(reader != nullptr); + Slice record; + std::string scratch; + s = ReadAndRecover(reader, &read_buffer_, cf_name_to_options, + column_families_not_found, builders, &version_edit); + if (s.ok()) { + bool enough = version_edit.has_next_file_number_ && + version_edit.has_log_number_ && + version_edit.has_last_sequence_; + if (enough) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + if (cfd == nullptr) { + enough = false; + break; + } + } + } + if (enough) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + assert(cfd != nullptr); + if (!cfd->IsDropped()) { + auto builder_iter = builders.find(cfd->GetID()); + assert(builder_iter != builders.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + true /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + if (!s.ok()) { + enough = false; + if (s.IsPathNotFound()) { + s = Status::OK(); + } + break; + } + } + } + } + if (enough) { + break; + } + } + ++retry; + } + + if (s.ok()) { + if (!version_edit.has_prev_log_number_) { + version_edit.prev_log_number_ = 0; + } + column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_); + + MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_); + MarkFileNumberUsed(version_edit.prev_log_number_); + MarkFileNumberUsed(version_edit.log_number_); + + for (auto cfd : *column_family_set_) { + assert(builders.count(cfd->GetID()) > 0); + auto builder = builders[cfd->GetID()]->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto* builder = builders_iter->second->version_builder(); + + Version* v = new Version(cfd, this, file_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(v->storage_info()); + + // Install recovered version + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), + !(db_options_->skip_stats_update_on_db_open)); + AppendVersion(cfd, v); + } + next_file_number_.store(version_edit.next_file_number_ + 1); + last_allocated_sequence_ = version_edit.last_sequence_; + last_published_sequence_ = version_edit.last_sequence_; + last_sequence_ = version_edit.last_sequence_; + prev_log_number_ = version_edit.prev_log_number_; + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + ROCKS_LOG_INFO(db_options_->info_log, + "Column family [%s] (ID 
%u), log number is %" PRIu64 "\n", + cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); + } + } + return s; +} + +Status ReactiveVersionSet::ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed) { + assert(manifest_reader != nullptr); + assert(cfds_changed != nullptr); + mu->AssertHeld(); + + Status s; + uint64_t applied_edits = 0; + while (s.ok()) { + Slice record; + std::string scratch; + log::Reader* reader = manifest_reader->get(); + std::string old_manifest_path = reader->file()->file_name(); + while (reader->ReadRecord(&record, &scratch)) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + // Skip the first VersionEdits of each MANIFEST generated by + // VersionSet::WriteCurrentStatetoManifest. + if (number_of_edits_to_skip_ > 0) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd != nullptr && !cfd->IsDropped()) { + --number_of_edits_to_skip_; + } + continue; + } + + s = read_buffer_.AddEdit(&edit); + if (!s.ok()) { + break; + } + VersionEdit temp_edit; + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit); + if (!s.ok()) { + break; + } + applied_edits++; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit); + if (s.ok()) { + applied_edits++; + } + } + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. + read_buffer_.Clear(); + } + // It's possible that: + // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. + // 2) we have finished reading the current MANIFEST. + // 3) we have encountered an IOError reading the current MANIFEST. + // We need to look for the next MANIFEST and start from there. If we cannot + // find the next MANIFEST, we should exit the loop. + s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); + reader = manifest_reader->get(); + if (s.ok()) { + if (reader->file()->file_name() == old_manifest_path) { + // Still processing the same MANIFEST, thus no need to continue this + // loop since no record is available if we have reached here. + break; + } else { + // We have switched to a new MANIFEST whose first records have been + // generated by VersionSet::WriteCurrentStatetoManifest. Since the + // secondary instance has already finished recovering upon start, there + // is no need for the secondary to process these records. Actually, if + // the secondary were to replay these records, the secondary may end up + // adding the same SST files AGAIN to each column family, causing + // consistency checks done by VersionBuilder to fail. Therefore, we + // record the number of records to skip at the beginning of the new + // MANIFEST and ignore them. + number_of_edits_to_skip_ = 0; + for (auto* cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + // Increase number_of_edits_to_skip by 2 because + // WriteCurrentStatetoManifest() writes 2 version edits for each + // column family at the beginning of the newly-generated MANIFEST. + // TODO(yanqin) remove hard-coded value. 
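+          // The extra record accounted for when write_dbid_to_manifest is set
+          // is the DB-id VersionEdit that WriteCurrentStateToManifest() emits
+          // at the head of every new MANIFEST (see above).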
+ if (db_options_->write_dbid_to_manifest) { + number_of_edits_to_skip_ += 3; + } else { + number_of_edits_to_skip_ += 2; + } + } + } + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + auto builder_iter = active_version_builders_.find(cfd->GetID()); + if (builder_iter == active_version_builders_.end()) { + continue; + } + auto builder = builder_iter->second->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", + &applied_edits); + return s; +} + +Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( + VersionEdit& edit, std::unordered_set* cfds_changed, + VersionEdit* version_edit) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + + // If we cannot find this column family in our column family set, then it + // may be a new column family created by the primary after the secondary + // starts. It is also possible that the secondary instance opens only a subset + // of column families. Ignore it for now. + if (nullptr == cfd) { + return Status::OK(); + } + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end() && + !cfd->IsDropped()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + + if (edit.is_column_family_add_) { + // TODO (yanqin) for now the secondary ignores column families created + // after Open. This also simplifies handling of switching to a new MANIFEST + // and processing the snapshot of the system at the beginning of the + // MANIFEST. + } else if (edit.is_column_family_drop_) { + // Drop the column family by setting it to be 'dropped' without destroying + // the column family handle. + // TODO (haoyu) figure out how to handle column faimly drop for + // secondary instance. (Is it possible that the ref count for cfd is 0 but + // the ref count for its versions is higher than 0?) 
+    cfd->SetDropped();
+    if (cfd->UnrefAndTryDelete()) {
+      cfd = nullptr;
+    }
+    active_version_builders_.erase(builder_iter);
+  } else {
+    Status s = builder->Apply(&edit);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (cfd != nullptr && !cfd->IsDropped()) {
+    s = builder->LoadTableHandlers(
+        cfd->internal_stats(), db_options_->max_file_opening_threads,
+        false /* prefetch_index_and_filter_in_cache */,
+        false /* is_initial_load */,
+        cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
+    TEST_SYNC_POINT_CALLBACK(
+        "ReactiveVersionSet::ApplyOneVersionEditToBuilder:"
+        "AfterLoadTableHandlers",
+        &s);
+
+    if (s.ok()) {
+      auto version = new Version(cfd, this, file_options_,
+                                 *cfd->GetLatestMutableCFOptions(),
+                                 current_version_number_++);
+      builder->SaveTo(version->storage_info());
+      version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true);
+      AppendVersion(cfd, version);
+      active_version_builders_.erase(builder_iter);
+      if (cfds_changed->count(cfd) == 0) {
+        cfds_changed->insert(cfd);
+      }
+    } else if (s.IsPathNotFound()) {
+      s = Status::OK();
+    }
+    // Any other error from LoadTableHandlers is propagated to the caller.
+  }
+
+  if (version_edit->HasNextFile()) {
+    next_file_number_.store(version_edit->next_file_number_ + 1);
+  }
+  if (version_edit->has_last_sequence_) {
+    last_allocated_sequence_ = version_edit->last_sequence_;
+    last_published_sequence_ = version_edit->last_sequence_;
+    last_sequence_ = version_edit->last_sequence_;
+  }
+  if (version_edit->has_prev_log_number_) {
+    prev_log_number_ = version_edit->prev_log_number_;
+    MarkFileNumberUsed(version_edit->prev_log_number_);
+  }
+  if (version_edit->has_log_number_) {
+    MarkFileNumberUsed(version_edit->log_number_);
+  }
+  column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_);
+  MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_);
+  return s;
+}
+
+Status ReactiveVersionSet::MaybeSwitchManifest(
+    log::Reader::Reporter* reporter,
+    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
+  assert(manifest_reader != nullptr);
+  Status s;
+  do {
+    std::string manifest_path;
+    s = GetCurrentManifestPath(dbname_, fs_, &manifest_path,
+                               &manifest_file_number_);
+    std::unique_ptr<FSSequentialFile> manifest_file;
+    if (s.ok()) {
+      if (nullptr == manifest_reader->get() ||
+          manifest_reader->get()->file()->file_name() != manifest_path) {
+        TEST_SYNC_POINT(
+            "ReactiveVersionSet::MaybeSwitchManifest:"
+            "AfterGetCurrentManifestPath:0");
+        TEST_SYNC_POINT(
+            "ReactiveVersionSet::MaybeSwitchManifest:"
+            "AfterGetCurrentManifestPath:1");
+        s = fs_->NewSequentialFile(manifest_path,
+                                   env_->OptimizeForManifestRead(file_options_),
+                                   &manifest_file, nullptr);
+      } else {
+        // No need to switch manifest.
+        break;
+      }
+    }
+    std::unique_ptr<SequentialFileReader> manifest_file_reader;
+    if (s.ok()) {
+      manifest_file_reader.reset(
+          new SequentialFileReader(std::move(manifest_file), manifest_path,
+                                   db_options_->log_readahead_size));
+      manifest_reader->reset(new log::FragmentBufferedReader(
+          nullptr, std::move(manifest_file_reader), reporter,
+          true /* checksum */, 0 /* log_number */));
+      ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
+                     manifest_path.c_str());
+      // TODO (yanqin) every time we switch to a new MANIFEST, we clear the
+      // active_version_builders_ map because we choose to construct the
+      // versions from scratch, thanks to the first part of each MANIFEST
+      // written by VersionSet::WriteCurrentStateToManifest.
+      // This is not necessary, but we choose this at present for the sake of
+      // simplicity.
+      active_version_builders_.clear();
+    }
+  } while (s.IsPathNotFound());
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
new file mode 100644
index 000000000..2ab09a5f8
--- /dev/null
+++ b/src/rocksdb/db/version_set.h
@@ -0,0 +1,1251 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level. The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version, VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/file_indexer.h"
+#include "db/log_reader.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace log {
+class Writer;
+}
+
+class Compaction;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class WriteBufferManager;
+class MergeContext;
+class ColumnFamilySet;
+class MergeIteratorBuilder;
+
+// VersionEdit is always supposed to be valid and it is used to point at
+// entries in the MANIFEST. Ideally it should not be used as a container to
+// carry around a few of its fields as function params, because that can make
+// readers think it is a valid entry from the MANIFEST. To avoid that
+// confusion, VersionEditParams is introduced to simply carry around multiple
+// VersionEdit params. It need not point to a valid record in the MANIFEST.
+using VersionEditParams = VersionEdit;
+
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+                    const LevelFilesBrief& file_level, const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
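+// For example (hypothetical keys): given disjoint sorted files [a..c] and
+// [e..g], the query range [d, f] overlaps the second file, so this returns
+// true, while the range [h, k] overlaps neither file and returns false.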
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const LevelFilesBrief& file_level, + const Slice* smallest_user_key, + const Slice* largest_user_key); + +// Generate LevelFilesBrief from vector +// Would copy smallest_key and largest_key data to sequential memory +// arena: Arena used to allocate the memory +extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, + const std::vector& files, + Arena* arena); + +// Information of the storage associated with each Version, including number of +// levels of LSM tree, files information at each level, files marked for +// compaction, etc. +class VersionStorageInfo { + public: + VersionStorageInfo(const InternalKeyComparator* internal_comparator, + const Comparator* user_comparator, int num_levels, + CompactionStyle compaction_style, + VersionStorageInfo* src_vstorage, + bool _force_consistency_checks); + // No copying allowed + VersionStorageInfo(const VersionStorageInfo&) = delete; + void operator=(const VersionStorageInfo&) = delete; + ~VersionStorageInfo(); + + void Reserve(int level, size_t size) { files_[level].reserve(size); } + + void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr); + + void SetFinalized(); + + // Update num_non_empty_levels_. + void UpdateNumNonEmptyLevels(); + + void GenerateFileIndexer() { + file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); + } + + // Update the accumulated stats from a file-meta. + void UpdateAccumulatedStats(FileMetaData* file_meta); + + // Decrease the current stat from a to-be-deleted file-meta + void RemoveCurrentStats(FileMetaData* file_meta); + + void ComputeCompensatedSizes(); + + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // REQUIRES: db_mutex held!! + // TODO find a better way to pass compaction_options_fifo. + void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options, + const MutableCFOptions& mutable_cf_options); + + // Estimate est_comp_needed_bytes_ + void EstimateCompactionBytesNeeded( + const MutableCFOptions& mutable_cf_options); + + // This computes files_marked_for_compaction_ and is called by + // ComputeCompactionScore() + void ComputeFilesMarkedForCompaction(); + + // This computes ttl_expired_files_ and is called by + // ComputeCompactionScore() + void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions, + const uint64_t ttl); + + // This computes files_marked_for_periodic_compaction_ and is called by + // ComputeCompactionScore() + void ComputeFilesMarkedForPeriodicCompaction( + const ImmutableCFOptions& ioptions, + const uint64_t periodic_compaction_seconds); + + // This computes bottommost_files_marked_for_compaction_ and is called by + // ComputeCompactionScore() or UpdateOldestSnapshot(). + // + // Among bottommost files (assumes they've already been computed), marks the + // ones that have keys that would be eliminated if recompacted, according to + // the seqnum of the oldest existing snapshot. Must be called every time + // oldest snapshot changes as that is when bottom-level files can become + // eligible for compaction. + // + // REQUIRES: DB mutex held + void ComputeBottommostFilesMarkedForCompaction(); + + // Generate level_files_brief_ from files_ + void GenerateLevelFilesBrief(); + // Sort all files for this version based on their file size and + // record results in files_by_compaction_pri_. The largest files are listed + // first. 
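+  // (Only the first kNumberFilesToSort entries per level are guaranteed to be
+  // fully sorted; see the comment on kNumberFilesToSort declared below.)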
+ void UpdateFilesByCompactionPri(CompactionPri compaction_pri); + + void GenerateLevel0NonOverlapping(); + bool level0_non_overlapping() const { + return level0_non_overlapping_; + } + + // Check whether each file in this version is bottommost (i.e., nothing in its + // key-range could possibly exist in an older file/level). + // REQUIRES: This version has not been saved + void GenerateBottommostFiles(); + + // Updates the oldest snapshot and related internal state, like the bottommost + // files marked for compaction. + // REQUIRES: DB mutex held + void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum); + + int MaxInputLevel() const; + int MaxOutputLevel(bool allow_ingest_behind) const; + + // Return level number that has idx'th highest score + int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } + + // Return idx'th highest score + double CompactionScore(int idx) const { return compaction_score_[idx]; } + + void GetOverlappingInputs( + int level, const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index = -1, // index of overlap file + int* file_index = nullptr, // return index of overlap file + bool expand_range = true, // if set, returns files which overlap the + // range and overlap each other. If false, + // then just files intersecting the range + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included + void GetCleanInputsWithinInterval( + int level, const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index = -1, // index of overlap file + int* file_index = nullptr) // return index of overlap file + const; + + void GetOverlappingInputsRangeBinarySearch( + int level, // level > 0 + const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + std::vector* inputs, + int hint_index, // index of overlap file + int* file_index, // return index of overlap file + bool within_interval = false, // if set, force the inputs within interval + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included + + // Returns true iff some file in the specified level overlaps + // some part of [*smallest_user_key,*largest_user_key]. + // smallest_user_key==NULL represents a key smaller than all keys in the DB. + // largest_user_key==NULL represents a key largest than all keys in the DB. + bool OverlapInLevel(int level, const Slice* smallest_user_key, + const Slice* largest_user_key); + + // Returns true iff the first or last file in inputs contains + // an overlapping user key to the file "just outside" of it (i.e. + // just after the last file, or just before the first file) + // REQUIRES: "*inputs" is a sorted list of non-overlapping files + bool HasOverlappingUserKey(const std::vector* inputs, + int level); + + int num_levels() const { return num_levels_; } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + int num_non_empty_levels() const { + assert(finalized_); + return num_non_empty_levels_; + } + + // REQUIRES: This version has been finalized. + // (CalculateBaseBytes() is called) + // This may or may not return number of level files. It is to keep backward + // compatible behavior in universal compaction. 
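+  // (Under universal compaction this is typically the total number of sorted
+  // runs rather than the count of physical L0 files, hence the "may or may
+  // not" above.)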
+ int l0_delay_trigger_count() const { return l0_delay_trigger_count_; } + + void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + int NumLevelFiles(int level) const { + assert(finalized_); + return static_cast(files_[level].size()); + } + + // Return the combined file size of all files at the specified level. + uint64_t NumLevelBytes(int level) const; + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& LevelFiles(int level) const { + return files_[level]; + } + + const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const { + assert(level < static_cast(level_files_brief_.size())); + return level_files_brief_[level]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& FilesByCompactionPri(int level) const { + assert(finalized_); + return files_by_compaction_pri_[level]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& FilesMarkedForCompaction() + const { + assert(finalized_); + return files_marked_for_compaction_; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& ExpiredTtlFiles() const { + assert(finalized_); + return expired_ttl_files_; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& + FilesMarkedForPeriodicCompaction() const { + assert(finalized_); + return files_marked_for_periodic_compaction_; + } + + void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) { + files_marked_for_periodic_compaction_.emplace_back(level, f); + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& + BottommostFilesMarkedForCompaction() const { + assert(finalized_); + return bottommost_files_marked_for_compaction_; + } + + int base_level() const { return base_level_; } + double level_multiplier() const { return level_multiplier_; } + + // REQUIRES: lock is held + // Set the index that is used to offset into files_by_compaction_pri_ to find + // the next compaction candidate file. + void SetNextCompactionIndex(int level, int index) { + next_file_to_compact_by_size_[level] = index; + } + + // REQUIRES: lock is held + int NextCompactionIndex(int level) const { + return next_file_to_compact_by_size_[level]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const FileIndexer& file_indexer() const { + assert(finalized_); + return file_indexer_; + } + + // Only the first few entries of files_by_compaction_pri_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const size_t kNumberFilesToSort = 50; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[1000]; + }; + struct FileSummaryStorage { + char buffer[3000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. 
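+  // Illustrative LevelSummary() output (exact format may differ):
+  //   "files[4 2 8 0 0 0 0] max score 1.25"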
+ const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Return a human readable string that describes this version's contents. + std::string DebugString(bool hex = false) const; + + uint64_t GetAverageValueSize() const { + if (accumulated_num_non_deletions_ == 0) { + return 0; + } + assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0); + assert(accumulated_file_size_ > 0); + return accumulated_raw_value_size_ / accumulated_num_non_deletions_ * + accumulated_file_size_ / + (accumulated_raw_key_size_ + accumulated_raw_value_size_); + } + + uint64_t GetEstimatedActiveKeys() const; + + double GetEstimatedCompressionRatioAtLevel(int level) const; + + // re-initializes the index that is used to offset into + // files_by_compaction_pri_ + // to find the next compaction candidate file. + void ResetNextCompactionIndex(int level) { + next_file_to_compact_by_size_[level] = 0; + } + + const InternalKeyComparator* InternalComparator() { + return internal_comparator_; + } + + // Returns maximum total bytes of data on a given level. + uint64_t MaxBytesForLevel(int level) const; + + // Must be called after any change to MutableCFOptions. + void CalculateBaseBytes(const ImmutableCFOptions& ioptions, + const MutableCFOptions& options); + + // Returns an estimate of the amount of live data in bytes. + uint64_t EstimateLiveDataSize() const; + + uint64_t estimated_compaction_needed_bytes() const { + return estimated_compaction_needed_bytes_; + } + + void TEST_set_estimated_compaction_needed_bytes(uint64_t v) { + estimated_compaction_needed_bytes_ = v; + } + + bool force_consistency_checks() const { return force_consistency_checks_; } + + SequenceNumber bottommost_files_mark_threshold() const { + return bottommost_files_mark_threshold_; + } + + // Returns whether any key in [`smallest_key`, `largest_key`] could appear in + // an older L0 file than `last_l0_idx` or in a greater level than `last_level` + // + // @param last_level Level after which we check for overlap + // @param last_l0_idx If `last_level == 0`, index of L0 file after which we + // check for overlap; otherwise, must be -1 + bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key, + const Slice& largest_user_key, + int last_level, int last_l0_idx); + + private: + const InternalKeyComparator* internal_comparator_; + const Comparator* user_comparator_; + int num_levels_; // Number of levels + int num_non_empty_levels_; // Number of levels. Any level larger than it + // is guaranteed to be empty. + // Per-level max bytes + std::vector level_max_bytes_; + + // A short brief metadata of files per level + autovector level_files_brief_; + FileIndexer file_indexer_; + Arena arena_; // Used to allocate space for file_levels_ + + CompactionStyle compaction_style_; + + // List of files per level, files in each level are arranged + // in increasing order of keys + std::vector* files_; + + // Level that L0 data should be compacted to. All levels < base_level_ should + // be empty. -1 if it is not level-compaction so it's not applicable. + int base_level_; + + double level_multiplier_; + + // A list for the same set of files that are stored in files_, + // but files in each level are now sorted based on file + // size. The file with the largest size is at the front. + // This vector stores the index of the file from files_. 
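+  // For illustration (hypothetical contents): files_by_compaction_pri_[2] ==
+  // {5, 0, 3} means that files_[2][5] is the top compaction candidate at
+  // level 2, followed by files_[2][0] and then files_[2][3].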
+  std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, means that files in L0 have keys with non overlapping ranges
+  bool level0_non_overlapping_;
+
+  // An index into files_by_compaction_pri_ that specifies the first
+  // file that is not yet compacted
+  std::vector<int> next_file_to_compact_by_size_;
+
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const size_t number_of_files_to_sort_ = 50;
+
+  // This vector contains list of files marked for compaction and also not
+  // currently being compacted. It is protected by DB mutex. It is calculated
+  // in ComputeCompactionScore()
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+  autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+  autovector<std::pair<int, FileMetaData*>>
+      files_marked_for_periodic_compaction_;
+
+  // These files are considered bottommost because none of their keys can exist
+  // at lower levels. They are not necessarily all in the same level. The
+  // marked ones are eligible for compaction because they contain duplicate key
+  // versions that are no longer protected by snapshot. These variables are
+  // protected by DB mutex and are calculated in `GenerateBottommostFiles()`
+  // and `ComputeBottommostFilesMarkedForCompaction()`.
+  autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+  autovector<std::pair<int, FileMetaData*>>
+      bottommost_files_marked_for_compaction_;
+
+  // Threshold for needing to mark another bottommost file. Maintain it so we
+  // can quickly check when releasing a snapshot whether more bottommost files
+  // became eligible for compaction. It's defined as the min of the max nonzero
+  // seqnums of unmarked bottommost files.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+  // Monotonically increases as we release old snapshots. Zero indicates no
+  // snapshots have been released yet. When no snapshots remain we set it to
+  // the current seqnum, which needs to be protected as a snapshot can still
+  // be created that references it.
+  SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed. These fields
+  // are initialized by Finalize().
+  // The most critical level to be compacted is listed first
+  // These are used to pick the best compaction level
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
+  int l0_delay_trigger_count_ = 0;  // Count used to trigger slow down and
+                                    // stop for number of L0 files.
+
+  // the following are the sampled temporary stats.
+  // the current accumulated size of sampled files.
+  uint64_t accumulated_file_size_;
+  // the current accumulated size of all raw keys based on the sampled files.
+  uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw values based on the sampled files.
+  uint64_t accumulated_raw_value_size_;
+  // total number of non-deletion entries
+  uint64_t accumulated_num_non_deletions_;
+  // total number of deletion entries
+  uint64_t accumulated_num_deletions_;
+  // current number of non_deletion entries
+  uint64_t current_num_non_deletions_;
+  // current number of deletion entries
+  uint64_t current_num_deletions_;
+  // current number of file samples
+  uint64_t current_num_samples_;
+  // Estimated bytes needed to be compacted until all levels' size is down to
+  // target sizes.
+ uint64_t estimated_compaction_needed_bytes_; + + bool finalized_; + + // If set to true, we will run consistency checks even if RocksDB + // is compiled in release mode + bool force_consistency_checks_; + + friend class Version; + friend class VersionSet; +}; + +using MultiGetRange = MultiGetContext::Range; +// A column family's version consists of the SST files owned by the column +// family at a certain point in time. +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, const FileOptions& soptions, + MergeIteratorBuilder* merger_iter_builder, + RangeDelAggregator* range_del_agg); + + void AddIteratorsForLevel(const ReadOptions&, const FileOptions& soptions, + MergeIteratorBuilder* merger_iter_builder, + int level, RangeDelAggregator* range_del_agg); + + Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&, + const Slice& smallest_user_key, + const Slice& largest_user_key, + int level, bool* overlap); + + // Lookup the value for key or get all merge operands for key. + // If do_merge = true (default) then lookup value for key. + // Behavior if do_merge = true: + // If found, store it in *value and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later. + // + // If the ReadOptions.read_tier is set to do a read-only fetch, then + // *value_found will be set to false if it cannot be determined whether + // this value exists without doing IO. + // + // If the key is Deleted, *status will be set to NotFound and + // *key_exists will be set to true. + // If no key was found, *status will be set to NotFound and + // *key_exists will be set to false. + // If seq is non-null, *seq will be set to the sequence number found + // for the key if a key was found. + // Behavior if do_merge = false + // If the key has any merge operands then store them in + // merge_context.operands_list and don't merge the operands + // REQUIRES: lock is not held + void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, + Status* status, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + bool* value_found = nullptr, bool* key_exists = nullptr, + SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, + bool* is_blob = nullptr, bool do_merge = true); + + void MultiGet(const ReadOptions&, MultiGetRange* range, + ReadCallback* callback = nullptr, bool* is_blob = nullptr); + + // Loads some stats information from files. Call without mutex held. It needs + // to be called before applying the version to the version set. + void PrepareApply(const MutableCFOptions& mutable_cf_options, + bool update_stats); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + // Decrease reference count. Delete the object if no reference left + // and return true. Otherwise, return false. + bool Unref(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::vector* live); + + // Return a human readable string that describes this version's contents. 
+  std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+  // Returns the version number of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties of the file
+  // specified in "file_meta". If the file name of "file_meta" is
+  // known ahead, passing it by a non-null "fname" can save a
+  // file-name conversion.
+  Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                            const FileMetaData* file_meta,
+                            const std::string* fname = nullptr) const;
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the sst file name, the values of `props` are the
+  // tables' properties, represented as std::shared_ptr<const TableProperties>.
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+  Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
+                                      TablePropertiesCollection* props) const;
+
+  // Print summary of range delete tombstones in SST files into out_str,
+  // with maximum max_entries_to_print entries printed out.
+  Status TablesRangeTombstoneSummary(int max_entries_to_print,
+                                     std::string* out_str);
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contain the aggregated table property among
+  // the table properties of all sst files in this version.
+  Status GetAggregatedTableProperties(
+      std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+  uint64_t GetEstimatedActiveKeys() {
+    return storage_info_.GetEstimatedActiveKeys();
+  }
+
+  size_t GetMemoryUsageByTableReaders();
+
+  ColumnFamilyData* cfd() const { return cfd_; }
+
+  // Return the next Version in the linked list. Used for debug only
+  Version* TEST_Next() const {
+    return next_;
+  }
+
+  int TEST_refs() const { return refs_; }
+
+  VersionStorageInfo* storage_info() { return &storage_info_; }
+
+  VersionSet* version_set() { return vset_; }
+
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+  uint64_t GetSstFilesSize();
+
+  // Retrieves the file_creation_time of the oldest file in the DB.
+  // Prerequisite for this API is max_open_files = -1
+  void GetCreationTimeOfOldestFile(uint64_t* creation_time);
+
+  const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
+
+ private:
+  Env* env_;
+  FileSystem* fs_;
+  friend class ReactiveVersionSet;
+  friend class VersionSet;
+
+  const InternalKeyComparator* internal_comparator() const {
+    return storage_info_.internal_comparator_;
+  }
+  const Comparator* user_comparator() const {
+    return storage_info_.user_comparator_;
+  }
+
+  bool PrefixMayMatch(const ReadOptions& read_options,
+                      InternalIterator* level_iter,
+                      const Slice& internal_prefix) const;
+
+  // Returns true if the filter blocks in the specified level will not be
+  // checked during read operations. In certain cases (trivial move or
+  // preload), the filter block may already be cached, but we still do not
+  // access it such that it eventually expires from the cache.
+  bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
+
+  // The helper function of UpdateAccumulatedStats, which may fill the missing
+  // fields of file_meta from its associated TableProperties.
+  // Returns true if it does initialize FileMetaData.
+  bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+  // Update the accumulated stats associated with the current version.
+ // This accumulated stats will be used in compaction. + void UpdateAccumulatedStats(bool update_stats); + + // Sort all files for this version based on their file size and + // record results in files_by_compaction_pri_. The largest files are listed + // first. + void UpdateFilesByCompactionPri(); + + ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs + Logger* info_log_; + Statistics* db_statistics_; + TableCache* table_cache_; + const MergeOperator* merge_operator_; + + VersionStorageInfo storage_info_; + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list + int refs_; // Number of live refs to this version + const FileOptions file_options_; + const MutableCFOptions mutable_cf_options_; + + // A version number that uniquely represents this version. This is + // used for debugging and logging purposes only. + uint64_t version_number_; + + Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, + MutableCFOptions mutable_cf_options, uint64_t version_number = 0); + + ~Version(); + + // No copying allowed + Version(const Version&) = delete; + void operator=(const Version&) = delete; +}; + +struct ObsoleteFileInfo { + FileMetaData* metadata; + std::string path; + + ObsoleteFileInfo() noexcept : metadata(nullptr) {} + ObsoleteFileInfo(FileMetaData* f, const std::string& file_path) + : metadata(f), path(file_path) {} + + ObsoleteFileInfo(const ObsoleteFileInfo&) = delete; + ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete; + + ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : + ObsoleteFileInfo() { + *this = std::move(rhs); + } + + ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept { + path = std::move(rhs.path); + metadata = rhs.metadata; + rhs.metadata = nullptr; + + return *this; + } + + void DeleteMetadata() { + delete metadata; + metadata = nullptr; + } +}; + +class BaseReferencedVersionBuilder; + +class AtomicGroupReadBuffer { + public: + Status AddEdit(VersionEdit* edit); + void Clear(); + bool IsFull() const; + bool IsEmpty() const; + + uint64_t TEST_read_edits_in_atomic_group() const { + return read_edits_in_atomic_group_; + } + std::vector& replay_buffer() { return replay_buffer_; } + + private: + uint64_t read_edits_in_atomic_group_ = 0; + std::vector replay_buffer_; +}; + +// VersionSet is the collection of versions of all the column families of the +// database. Each database owns one VersionSet. A VersionSet has access to all +// column families via ColumnFamilySet, i.e. set of the column families. +class VersionSet { + public: + VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, + const FileOptions& file_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); + // No copying allowed + VersionSet(const VersionSet&) = delete; + void operator=(const VersionSet&) = delete; + + virtual ~VersionSet(); + + // Apply *edit to the current version to form a new descriptor that + // is both saved to persistent state and installed as the new + // current version. Will release *mu while actually writing to the file. + // column_family_options has to be set if edit is column family add + // REQUIRES: *mu is held on entry. 
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+      InstrumentedMutex* mu, Directory* db_directory = nullptr,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr) {
+    autovector<ColumnFamilyData*> cfds;
+    cfds.emplace_back(column_family_data);
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    mutable_cf_options_list.emplace_back(&mutable_cf_options);
+    autovector<autovector<VersionEdit*>> edit_lists;
+    autovector<VersionEdit*> edit_list;
+    edit_list.emplace_back(edit);
+    edit_lists.emplace_back(edit_list);
+    return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                       db_directory, new_descriptor_log,
+                       column_family_options);
+  }
+  // The batch version. If edit_list.size() > 1, caller must ensure that
+  // no edit in the list is a column family addition or drop.
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options,
+      const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+      Directory* db_directory = nullptr, bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr) {
+    autovector<ColumnFamilyData*> cfds;
+    cfds.emplace_back(column_family_data);
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    mutable_cf_options_list.emplace_back(&mutable_cf_options);
+    autovector<autovector<VersionEdit*>> edit_lists;
+    edit_lists.emplace_back(edit_list);
+    return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                       db_directory, new_descriptor_log,
+                       column_family_options);
+  }
+
+  // The across-multi-cf batch version. If edit_lists contains more than one
+  // edit list, the caller must ensure that no edit in the lists is a column
+  // family manipulation.
+  virtual Status LogAndApply(
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<autovector<VersionEdit*>>& edit_lists,
+      InstrumentedMutex* mu, Directory* db_directory = nullptr,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* new_cf_options = nullptr);
+
+  static Status GetCurrentManifestPath(const std::string& dbname,
+                                       FileSystem* fs,
+                                       std::string* manifest_filename,
+                                       uint64_t* manifest_file_number);
+
+  // Recover the last saved descriptor from persistent storage.
+  // If read_only == true, Recover() will not complain if some column families
+  // are not opened
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false, std::string* db_id = nullptr);
+
+  // Reads a manifest file and returns a list of column families in
+  // column_families.
+  static Status ListColumnFamilies(std::vector<std::string>* column_families,
+                                   const std::string& dbname, FileSystem* fs);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid only when, among
+  // the levels from the new max level up to the old max level, just one
+  // level contains files.
+  // The call is static, since number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces number of levels
+  // in a DB by applying changes to manifest.
+  // For example, a db currently has 7 levels [0-6], and a call to
+  // reduce to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+ static Status ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const FileOptions& file_options, + int new_levels); + + // Get the checksum information of all live files + Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list); + + // printf contents (for debugging) + Status DumpManifest(Options& options, std::string& manifestFileName, + bool verbose, bool hex = false, bool json = false); + +#endif // ROCKSDB_LITE + + // Return the current manifest file number + uint64_t manifest_file_number() const { return manifest_file_number_; } + + uint64_t options_file_number() const { return options_file_number_; } + + uint64_t pending_manifest_file_number() const { + return pending_manifest_file_number_; + } + + uint64_t current_next_file_number() const { return next_file_number_.load(); } + + uint64_t min_log_number_to_keep_2pc() const { + return min_log_number_to_keep_2pc_.load(); + } + + // Allocate and return a new file number + uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } + + // Fetch And Add n new file number + uint64_t FetchAddFileNumber(uint64_t n) { + return next_file_number_.fetch_add(n); + } + + // Return the last sequence number. + uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } + + // Note: memory_order_acquire must be sufficient. + uint64_t LastAllocatedSequence() const { + return last_allocated_sequence_.load(std::memory_order_seq_cst); + } + + // Note: memory_order_acquire must be sufficient. + uint64_t LastPublishedSequence() const { + return last_published_sequence_.load(std::memory_order_seq_cst); + } + + // Set the last sequence number to s. + void SetLastSequence(uint64_t s) { + assert(s >= last_sequence_); + // Last visible sequence must always be less than last written seq + assert(!db_options_->two_write_queues || s <= last_allocated_sequence_); + last_sequence_.store(s, std::memory_order_release); + } + + // Note: memory_order_release must be sufficient + void SetLastPublishedSequence(uint64_t s) { + assert(s >= last_published_sequence_); + last_published_sequence_.store(s, std::memory_order_seq_cst); + } + + // Note: memory_order_release must be sufficient + void SetLastAllocatedSequence(uint64_t s) { + assert(s >= last_allocated_sequence_); + last_allocated_sequence_.store(s, std::memory_order_seq_cst); + } + + // Note: memory_order_release must be sufficient + uint64_t FetchAddLastAllocatedSequence(uint64_t s) { + return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst); + } + + // Mark the specified file number as used. + // REQUIRED: this is only called during single-threaded recovery or repair. + void MarkFileNumberUsed(uint64_t number); + + // Mark the specified log number as deleted + // REQUIRED: this is only called during single-threaded recovery or repair, or + // from ::LogAndApply where the global mutex is held. + void MarkMinLogNumberToKeep2PC(uint64_t number); + + // Return the log file number for the log file that is currently + // being compacted, or zero if there is no such log file. + uint64_t prev_log_number() const { return prev_log_number_; } + + // Returns the minimum log number which still has data not flushed to any SST + // file. + // In non-2PC mode, all the log numbers smaller than this number can be safely + // deleted. 
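+  // For example (hypothetical log numbers): with three live column families
+  // whose log numbers are {12, 7, 9}, this returns 7, so in non-2PC mode all
+  // WAL files numbered below 7 hold no unflushed data and can be deleted.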
+ uint64_t MinLogNumberWithUnflushedData() const { + return PreComputeMinLogNumberWithUnflushedData(nullptr); + } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfd_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const ColumnFamilyData* cfd_to_skip) const { + uint64_t min_log_num = std::numeric_limits::max(); + for (auto cfd : *column_family_set_) { + if (cfd == cfd_to_skip) { + continue; + } + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } + + // Create an iterator that reads over the compaction inputs for "*c". + // The caller should delete the iterator when no longer needed. + InternalIterator* MakeInputIterator( + const Compaction* c, RangeDelAggregator* range_del_agg, + const FileOptions& file_options_compactions); + + // Add all files listed in any live version to *live. + void AddLiveFiles(std::vector* live_list); + + // Return the approximate size of data to be scanned for range [start, end) + // in levels [start_level, end_level). If end_level == -1 it will search + // through all non-empty levels + uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + const Slice& start, const Slice& end, + int start_level, int end_level, + TableReaderCaller caller); + + // Return the size of the current manifest file + uint64_t manifest_file_size() const { return manifest_file_size_; } + + // verify that the files that we started with for a compaction + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact. + bool VerifyCompactionFileConsistency(Compaction* c); + + Status GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData** metadata, ColumnFamilyData** cfd); + + // This function doesn't support leveldb SST filenames + void GetLiveFilesMetaData(std::vector *metadata); + + void GetObsoleteFiles(std::vector* files, + std::vector* manifest_filenames, + uint64_t min_pending_output); + + ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + const FileOptions& file_options() { return file_options_; } + void ChangeFileOptions(const MutableDBOptions& new_options) { + file_options_.writable_file_max_buffer_size = + new_options.writable_file_max_buffer_size; + } + + const ImmutableDBOptions* db_options() const { return db_options_; } + + static uint64_t GetNumLiveVersions(Version* dummy_versions); + + static uint64_t GetTotalSstFilesSize(Version* dummy_versions); + + protected: + struct ManifestWriter; + + friend class Version; + friend class DBImpl; + friend class DBImplReadOnly; + + struct LogReporter : public log::Reader::Reporter { + Status* status; + virtual void Corruption(size_t /*bytes*/, const Status& s) override { + if (this->status->ok()) *this->status = s; + } + }; + + // Returns approximated offset of a key in a file for a given version. + uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, TableReaderCaller caller); + + // Returns approximated data size between start and end keys in a file + // for a given version. 
+  uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+                           const Slice& start, const Slice& end,
+                           TableReaderCaller caller);
+
+  struct MutableCFState {
+    uint64_t log_number;
+  };
+
+  // Save current contents to *log
+  Status WriteCurrentStateToManifest(
+      const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+      log::Writer* log);
+
+  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                       VersionEdit* edit);
+
+  Status ReadAndRecover(
+      log::Reader* reader, AtomicGroupReadBuffer* read_buffer,
+      const std::unordered_map<std::string, ColumnFamilyOptions>&
+          name_to_options,
+      std::unordered_map<int, std::string>& column_families_not_found,
+      std::unordered_map<
+          uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
+      VersionEditParams* version_edit, std::string* db_id = nullptr);
+
+  // REQUIRES db mutex
+  Status ApplyOneVersionEditToBuilder(
+      VersionEdit& edit,
+      const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_opts,
+      std::unordered_map<int, std::string>& column_families_not_found,
+      std::unordered_map<
+          uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
+      VersionEditParams* version_edit);
+
+  Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+                                    const VersionEdit& from_edit,
+                                    VersionEditParams* version_edit_params);
+
+  std::unique_ptr<ColumnFamilySet> column_family_set_;
+
+  Env* const env_;
+  FileSystem* const fs_;
+  const std::string dbname_;
+  std::string db_id_;
+  const ImmutableDBOptions* const db_options_;
+  std::atomic<uint64_t> next_file_number_;
+  // Any log number equal or lower than this should be ignored during recovery,
+  // and is qualified for being deleted in 2PC mode. In non-2PC mode, this
+  // number is ignored.
+  std::atomic<uint64_t> min_log_number_to_keep_2pc_ = {0};
+  uint64_t manifest_file_number_;
+  uint64_t options_file_number_;
+  uint64_t pending_manifest_file_number_;
+  // The last seq visible to reads. It normally indicates the last sequence in
+  // the memtable but when using two write queues it could also indicate the
+  // last sequence in the WAL visible to reads.
+  std::atomic<uint64_t> last_sequence_;
+  // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case seq might or might not have appeared in the
+  // memtable but it is expected to appear in the WAL.
+  // We have last_sequence_ <= last_allocated_sequence_.
+  std::atomic<uint64_t> last_allocated_sequence_;
+  // The last allocated sequence that is also published to the readers. This is
+  // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+  // last_sequence_ also indicates the last published seq.
+  // We have last_sequence_ <= last_published_sequence_ <=
+  // last_allocated_sequence_.
+  std::atomic<uint64_t> last_published_sequence_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  std::unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Current size of manifest file
+  uint64_t manifest_file_size_;
+
+  std::vector<ObsoleteFileInfo> obsolete_files_;
+  std::vector<std::string> obsolete_manifests_;
+
+  // env options for all reads and writes except compactions
+  FileOptions file_options_;
+
+  BlockCacheTracer* const block_cache_tracer_;
+
+ private:
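+  // ProcessManifestWrites implements a simple group commit over the
+  // manifest_writers_ queue above: the writer at the head of the queue
+  // persists the batched edits and then wakes up the other waiting writers.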
+  // REQUIRES db mutex at beginning. May release and re-acquire db mutex.
+  Status ProcessManifestWrites(std::deque<ManifestWriter*>& writers,
+                               InstrumentedMutex* mu, Directory* db_directory,
+                               bool new_descriptor_log,
+                               const ColumnFamilyOptions* new_cf_options);
+
+  void LogAndApplyCFHelper(VersionEdit* edit);
+  Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
+                           VersionEdit* edit, InstrumentedMutex* mu);
+};
+
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
+class ReactiveVersionSet : public VersionSet {
+ public:
+  ReactiveVersionSet(const std::string& dbname,
+                     const ImmutableDBOptions* _db_options,
+                     const FileOptions& _file_options, Cache* table_cache,
+                     WriteBufferManager* write_buffer_manager,
+                     WriteController* write_controller);
+
+  ~ReactiveVersionSet() override;
+
+  Status ReadAndApply(
+      InstrumentedMutex* mu,
+      std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+      std::unordered_set<ColumnFamilyData*>* cfds_changed);
+
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+                 std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+                 std::unique_ptr<Status>* manifest_reader_status);
+
+  uint64_t TEST_read_edits_in_atomic_group() const {
+    return read_buffer_.TEST_read_edits_in_atomic_group();
+  }
+  std::vector<VersionEdit>& replay_buffer() {
+    return read_buffer_.replay_buffer();
+  }
+
+ protected:
+  using VersionSet::ApplyOneVersionEditToBuilder;
+
+  // REQUIRES db mutex
+  Status ApplyOneVersionEditToBuilder(
+      VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+      VersionEdit* version_edit);
+
+  Status MaybeSwitchManifest(
+      log::Reader::Reporter* reporter,
+      std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
+
+ private:
+  std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+      active_version_builders_;
+  AtomicGroupReadBuffer read_buffer_;
+  // Number of version edits to skip by ReadAndApply at the beginning of a new
+  // MANIFEST created by primary.
+  int number_of_edits_to_skip_;
+
+  using VersionSet::LogAndApply;
+  using VersionSet::Recover;
+
+  Status LogAndApply(
+      const autovector<ColumnFamilyData*>& /*cfds*/,
+      const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
+      const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
+      InstrumentedMutex* /*mu*/, Directory* /*db_directory*/,
+      bool /*new_descriptor_log*/,
+      const ColumnFamilyOptions* /*new_cf_option*/) override {
+    return Status::NotSupported("not supported in reactive mode");
+  }
+
+  // No copy allowed
+  ReactiveVersionSet(const ReactiveVersionSet&);
+  ReactiveVersionSet& operator=(const ReactiveVersionSet&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
new file mode 100644
index 000000000..03e0e26d2
--- /dev/null
+++ b/src/rocksdb/db/version_set_test.cc
@@ -0,0 +1,1287 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "db/version_set.h" +#include "db/db_impl/db_impl.h" +#include "db/log_writer.h" +#include "logging/logging.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class GenerateLevelFilesBriefTest : public testing::Test { + public: + std::vector files_; + LevelFilesBrief file_level_; + Arena arena_; + + GenerateLevelFilesBriefTest() { } + + ~GenerateLevelFilesBriefTest() override { + for (size_t i = 0; i < files_.size(); i++) { + delete files_[i]; + } + } + + void Add(const char* smallest, const char* largest, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + FileMetaData* f = new FileMetaData( + files_.size() + 1, 0, 0, + InternalKey(smallest, smallest_seq, kTypeValue), + InternalKey(largest, largest_seq, kTypeValue), smallest_seq, + largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + files_.push_back(f); + } + + int Compare() { + int diff = 0; + for (size_t i = 0; i < files_.size(); i++) { + if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) { + diff++; + } + } + return diff; + } +}; + +TEST_F(GenerateLevelFilesBriefTest, Empty) { + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); + ASSERT_EQ(0u, file_level_.num_files); + ASSERT_EQ(0, Compare()); +} + +TEST_F(GenerateLevelFilesBriefTest, Single) { + Add("p", "q"); + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); + ASSERT_EQ(1u, file_level_.num_files); + ASSERT_EQ(0, Compare()); +} + +TEST_F(GenerateLevelFilesBriefTest, Multiple) { + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); + ASSERT_EQ(4u, file_level_.num_files); + ASSERT_EQ(0, Compare()); +} + +class CountingLogger : public Logger { + public: + CountingLogger() : log_count(0) {} + using Logger::Logv; + void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; } + int log_count; +}; + +Options GetOptionsWithNumLevels(int num_levels, + std::shared_ptr logger) { + Options opt; + opt.num_levels = num_levels; + opt.info_log = logger; + return opt; +} + +class VersionStorageInfoTest : public testing::Test { + public: + const Comparator* ucmp_; + InternalKeyComparator icmp_; + std::shared_ptr logger_; + Options options_; + ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; + VersionStorageInfo vstorage_; + + InternalKey GetInternalKey(const char* ukey, + SequenceNumber smallest_seq = 100) { + return InternalKey(ukey, smallest_seq, kTypeValue); + } + + VersionStorageInfoTest() + : ucmp_(BytewiseComparator()), + icmp_(ucmp_), + logger_(new CountingLogger()), + options_(GetOptionsWithNumLevels(6, logger_)), + ioptions_(options_), + mutable_cf_options_(options_), + vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, nullptr, false) {} + + ~VersionStorageInfoTest() override { + for (int i = 0; i < vstorage_.num_levels(); i++) { + for (auto* f : vstorage_.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } + } + + void Add(int level, uint32_t file_number, const char* smallest, + const char* largest, uint64_t file_size = 0) { + assert(level < vstorage_.num_levels()); + FileMetaData* f = new FileMetaData( + file_number, 0, file_size, GetInternalKey(smallest, 0), + GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, + /* 
marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); + f->compensated_file_size = file_size; + vstorage_.AddFile(level, f); + } + + void Add(int level, uint32_t file_number, const InternalKey& smallest, + const InternalKey& largest, uint64_t file_size = 0) { + assert(level < vstorage_.num_levels()); + FileMetaData* f = new FileMetaData( + file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, + /* largest_seq */ 0, /* marked_for_compact */ false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName); + f->compensated_file_size = file_size; + vstorage_.AddFile(level, f); + } + + std::string GetOverlappingFiles(int level, const InternalKey& begin, + const InternalKey& end) { + std::vector inputs; + vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs); + + std::string result; + for (size_t i = 0; i < inputs.size(); ++i) { + if (i > 0) { + result += ","; + } + AppendNumberTo(&result, inputs[i]->fd.GetNumber()); + } + return result; + } +}; + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) { + ioptions_.level_compaction_dynamic_level_bytes = false; + mutable_cf_options_.max_bytes_for_level_base = 10; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + Add(4, 100U, "1", "2"); + Add(5, 101U, "1", "2"); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U); + + ASSERT_EQ(0, logger_->log_count); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + Add(5, 1U, "1", "2", 500U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(vstorage_.base_level(), 5); + + Add(5, 2U, "3", "4", 550U); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U); + ASSERT_EQ(vstorage_.base_level(), 4); + + Add(4, 3U, "3", "4", 550U); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U); + ASSERT_EQ(vstorage_.base_level(), 4); + + Add(3, 4U, "3", "4", 250U); + Add(3, 5U, "5", "7", 300U); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(1, logger_->log_count); + ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U); + ASSERT_EQ(vstorage_.base_level(), 3); + + Add(1, 6U, "3", "4", 5U); + Add(1, 7U, "8", "9", 5U); + logger_->log_count = 0; + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(1, logger_->log_count); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U); + ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U); + ASSERT_EQ(vstorage_.base_level(), 1); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 100; + 
mutable_cf_options_.max_bytes_for_level_multiplier = 2;
+  Add(0, 1U, "1", "2", 50U);
+  Add(1, 2U, "1", "2", 50U);
+  Add(2, 3U, "1", "2", 500U);
+  Add(3, 4U, "1", "2", 500U);
+  Add(4, 5U, "1", "2", 1700U);
+  Add(5, 6U, "1", "2", 500U);
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
+  ASSERT_EQ(vstorage_.base_level(), 1);
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
+  uint64_t kOneGB = 1000U * 1000U * 1000U;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  Add(0, 1U, "1", "2", 50U);
+  Add(3, 4U, "1", "2", 32U * kOneGB);
+  Add(4, 5U, "1", "2", 500U * kOneGB);
+  Add(5, 6U, "1", "2", 3000U * kOneGB);
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB);
+  ASSERT_EQ(vstorage_.base_level(), 2);
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 40000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+  Add(0, 1U, "1", "2", 10000U);
+  Add(0, 2U, "1", "2", 10000U);
+  Add(0, 3U, "1", "2", 10000U);
+
+  Add(5, 4U, "1", "2", 1286250U);
+  Add(4, 5U, "1", "2", 200000U);
+  Add(3, 6U, "1", "2", 40000U);
+  Add(2, 7U, "1", "2", 8000U);
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(2, vstorage_.base_level());
+  // L0 (30,000 bytes) is smaller than the base-level target here, so the
+  // configured multiplier of 5.0 is kept unchanged.
+  ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
+  // Level targets derive from the last level: L4 = 1286250 / 5 = 257250,
+  // L3 = 257250 / 5 = 51450, and L2 is clamped up to the base of 40,000.
+  ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
+  ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+  ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 10000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+  Add(0, 11U, "1", "2", 10000U);
+  Add(0, 12U, "1", "2", 10000U);
+  Add(0, 13U, "1", "2", 10000U);
+
+  Add(5, 4U, "1", "2", 1286250U);
+  Add(4, 5U, "1", "2", 200000U);
+  Add(3, 6U, "1", "2", 40000U);
+  Add(2, 7U, "1", "2", 8000U);
+
+  vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(2, vstorage_.base_level());
+  // level multiplier should be 3.5
+  ASSERT_LT(vstorage_.level_multiplier(), 3.6);
+  ASSERT_GT(vstorage_.level_multiplier(), 3.4);
+  // Level size should be around 30,000, 105,000, 367,500
+  ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2));
+  ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U);
+  ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U);
+  ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U);
+  ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U);
+}
+
+TEST_F(VersionStorageInfoTest,
MaxBytesForLevelDynamicWithLargeL0_3) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + Add(0, 13U, "1", "2", 5000U); + Add(0, 14U, "1", "2", 5000U); + Add(0, 15U, "1", "2", 5000U); + Add(0, 16U, "1", "2", 5000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + ASSERT_LT(vstorage_.level_multiplier(), 3.6); + ASSERT_GT(vstorage_.level_multiplier(), 3.4); + // Level size should be around 30,000, 105,000, 367,500 + ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U); + ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U); +} + +TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { + // Test whether the overlaps are detected as expected + Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level + Add(2, 2U, "3", "5", 1U); // Partial overlap with last level + Add(2, 3U, "6", "8", 1U); // Partial overlap with last level + Add(3, 4U, "1", "9", 1U); // Contains range of last level + Add(4, 5U, "4", "5", 1U); // Inside range of last level + Add(4, 5U, "6", "7", 1U); // Inside range of last level + Add(5, 6U, "4", "7", 10U); + ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize()); +} + +TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { + Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered + Add(0, 1U, "5", "6", 1U); // Ignored because of [5,6] in l1 + Add(1, 1U, "1", "2", 1U); // Ignored because of [2,3] in l2 + Add(1, 2U, "3", "4", 1U); // Ignored because of [2,3] in l2 + Add(1, 3U, "5", "6", 1U); + Add(2, 4U, "2", "3", 1U); + Add(3, 5U, "7", "8", 1U); + ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize()); +} + +TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { + // Two files that overlap at the range deletion tombstone sentinel. + Add(1, 1U, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1); + Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1); + // Two files that overlap at the same user key. + Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1); + Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1); + // Two files that do not overlap. 
+ Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1); + Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateLevelFilesBrief(); + + ASSERT_EQ("1,2", GetOverlappingFiles( + 1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue})); + ASSERT_EQ("1", GetOverlappingFiles( + 1, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("2", GetOverlappingFiles( + 1, {"b", kMaxSequenceNumber, kTypeValue}, {"c", 0, kTypeValue})); + ASSERT_EQ("3,4", GetOverlappingFiles( + 1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue})); + ASSERT_EQ("3", GetOverlappingFiles( + 1, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("3,4", GetOverlappingFiles( + 1, {"e", kMaxSequenceNumber, kTypeValue}, {"f", 0, kTypeValue})); + ASSERT_EQ("3,4", GetOverlappingFiles( + 1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue})); + ASSERT_EQ("5", GetOverlappingFiles( + 1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue})); + ASSERT_EQ("6", GetOverlappingFiles( + 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); +} + + +class FindLevelFileTest : public testing::Test { + public: + LevelFilesBrief file_level_; + bool disjoint_sorted_files_; + Arena arena_; + + FindLevelFileTest() : disjoint_sorted_files_(true) { } + + ~FindLevelFileTest() override {} + + void LevelFileInit(size_t num = 0) { + char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange)); + file_level_.files = new (mem)FdWithKeyRange[num]; + file_level_.num_files = 0; + } + + void Add(const char* smallest, const char* largest, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue); + InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue); + + Slice smallest_slice = smallest_key.Encode(); + Slice largest_slice = largest_key.Encode(); + + char* mem = arena_.AllocateAligned( + smallest_slice.size() + largest_slice.size()); + memcpy(mem, smallest_slice.data(), smallest_slice.size()); + memcpy(mem + smallest_slice.size(), largest_slice.data(), + largest_slice.size()); + + // add to file_level_ + size_t num = file_level_.num_files; + auto& file = file_level_.files[num]; + file.fd = FileDescriptor(num + 1, 0, 0); + file.smallest_key = Slice(mem, smallest_slice.size()); + file.largest_key = Slice(mem + smallest_slice.size(), + largest_slice.size()); + file_level_.num_files++; + } + + int Find(const char* key) { + InternalKey target(key, 100, kTypeValue); + InternalKeyComparator cmp(BytewiseComparator()); + return FindFile(cmp, file_level_, target.Encode()); + } + + bool Overlaps(const char* smallest, const char* largest) { + InternalKeyComparator cmp(BytewiseComparator()); + Slice s(smallest != nullptr ? smallest : ""); + Slice l(largest != nullptr ? largest : ""); + return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_, + (smallest != nullptr ? &s : nullptr), + (largest != nullptr ? &l : nullptr)); + } +}; + +TEST_F(FindLevelFileTest, LevelEmpty) { + LevelFileInit(0); + + ASSERT_EQ(0, Find("foo")); + ASSERT_TRUE(! Overlaps("a", "z")); + ASSERT_TRUE(! Overlaps(nullptr, "z")); + ASSERT_TRUE(! Overlaps("a", nullptr)); + ASSERT_TRUE(! Overlaps(nullptr, nullptr)); +} + +TEST_F(FindLevelFileTest, LevelSingle) { + LevelFileInit(1); + + Add("p", "q"); + ASSERT_EQ(0, Find("a")); + ASSERT_EQ(0, Find("p")); + ASSERT_EQ(0, Find("p1")); + ASSERT_EQ(0, Find("q")); + ASSERT_EQ(1, Find("q1")); + ASSERT_EQ(1, Find("z")); + + ASSERT_TRUE(! 
Overlaps("a", "b")); + ASSERT_TRUE(! Overlaps("z1", "z2")); + ASSERT_TRUE(Overlaps("a", "p")); + ASSERT_TRUE(Overlaps("a", "q")); + ASSERT_TRUE(Overlaps("a", "z")); + ASSERT_TRUE(Overlaps("p", "p1")); + ASSERT_TRUE(Overlaps("p", "q")); + ASSERT_TRUE(Overlaps("p", "z")); + ASSERT_TRUE(Overlaps("p1", "p2")); + ASSERT_TRUE(Overlaps("p1", "z")); + ASSERT_TRUE(Overlaps("q", "q")); + ASSERT_TRUE(Overlaps("q", "q1")); + + ASSERT_TRUE(! Overlaps(nullptr, "j")); + ASSERT_TRUE(! Overlaps("r", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "p")); + ASSERT_TRUE(Overlaps(nullptr, "p1")); + ASSERT_TRUE(Overlaps("q", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); +} + +TEST_F(FindLevelFileTest, LevelMultiple) { + LevelFileInit(4); + + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_EQ(0, Find("100")); + ASSERT_EQ(0, Find("150")); + ASSERT_EQ(0, Find("151")); + ASSERT_EQ(0, Find("199")); + ASSERT_EQ(0, Find("200")); + ASSERT_EQ(1, Find("201")); + ASSERT_EQ(1, Find("249")); + ASSERT_EQ(1, Find("250")); + ASSERT_EQ(2, Find("251")); + ASSERT_EQ(2, Find("299")); + ASSERT_EQ(2, Find("300")); + ASSERT_EQ(2, Find("349")); + ASSERT_EQ(2, Find("350")); + ASSERT_EQ(3, Find("351")); + ASSERT_EQ(3, Find("400")); + ASSERT_EQ(3, Find("450")); + ASSERT_EQ(4, Find("451")); + + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! Overlaps("251", "299")); + ASSERT_TRUE(! Overlaps("451", "500")); + ASSERT_TRUE(! Overlaps("351", "399")); + + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); +} + +TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) { + LevelFileInit(4); + + Add("150", "200"); + Add("200", "250"); + Add("300", "350"); + Add("400", "450"); + ASSERT_TRUE(! Overlaps(nullptr, "149")); + ASSERT_TRUE(! Overlaps("451", nullptr)); + ASSERT_TRUE(Overlaps(nullptr, nullptr)); + ASSERT_TRUE(Overlaps(nullptr, "150")); + ASSERT_TRUE(Overlaps(nullptr, "199")); + ASSERT_TRUE(Overlaps(nullptr, "200")); + ASSERT_TRUE(Overlaps(nullptr, "201")); + ASSERT_TRUE(Overlaps(nullptr, "400")); + ASSERT_TRUE(Overlaps(nullptr, "800")); + ASSERT_TRUE(Overlaps("100", nullptr)); + ASSERT_TRUE(Overlaps("200", nullptr)); + ASSERT_TRUE(Overlaps("449", nullptr)); + ASSERT_TRUE(Overlaps("450", nullptr)); +} + +TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) { + LevelFileInit(1); + + Add("200", "200", 5000, 3000); + ASSERT_TRUE(! Overlaps("199", "199")); + ASSERT_TRUE(! Overlaps("201", "300")); + ASSERT_TRUE(Overlaps("200", "200")); + ASSERT_TRUE(Overlaps("190", "200")); + ASSERT_TRUE(Overlaps("200", "210")); +} + +TEST_F(FindLevelFileTest, LevelOverlappingFiles) { + LevelFileInit(2); + + Add("150", "600"); + Add("400", "500"); + disjoint_sorted_files_ = false; + ASSERT_TRUE(! Overlaps("100", "149")); + ASSERT_TRUE(! 
Overlaps("601", "700")); + ASSERT_TRUE(Overlaps("100", "150")); + ASSERT_TRUE(Overlaps("100", "200")); + ASSERT_TRUE(Overlaps("100", "300")); + ASSERT_TRUE(Overlaps("100", "400")); + ASSERT_TRUE(Overlaps("100", "500")); + ASSERT_TRUE(Overlaps("375", "400")); + ASSERT_TRUE(Overlaps("450", "450")); + ASSERT_TRUE(Overlaps("450", "500")); + ASSERT_TRUE(Overlaps("450", "700")); + ASSERT_TRUE(Overlaps("600", "700")); +} + +class VersionSetTestBase { + public: + const static std::string kColumnFamilyName1; + const static std::string kColumnFamilyName2; + const static std::string kColumnFamilyName3; + int num_initial_edits_; + + VersionSetTestBase() + : env_(Env::Default()), + fs_(std::make_shared(env_)), + dbname_(test::PerThreadDBPath("version_set_test")), + db_options_(), + mutable_cf_options_(cf_options_), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + shutting_down_(false), + mock_table_factory_(std::make_shared()) { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + db_options_.env = env_; + db_options_.fs = fs_; + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_, + /*block_cache_tracer=*/nullptr)), + reactive_versions_ = std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + } + + void PrepareManifest(std::vector* column_families, + SequenceNumber* last_seqno, + std::unique_ptr* log_writer) { + assert(column_families != nullptr); + assert(last_seqno != nullptr); + assert(log_writer != nullptr); + VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::vector cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + const int kInitialNumOfCfs = static_cast(cf_names.size()); + autovector new_cfs; + uint64_t last_seq = 1; + uint32_t cf_id = 1; + for (int i = 1; i != kInitialNumOfCfs; ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + new_cf.SetLogNumber(0); + new_cf.SetNextFile(2); + new_cf.SetLastSequence(last_seq++); + new_cfs.emplace_back(new_cf); + } + *last_seqno = last_seq; + num_initial_edits_ = static_cast(new_cfs.size() + 1); + const std::string manifest = DescriptorFileName(dbname_, 1); + std::unique_ptr file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + ASSERT_OK(s); + std::unique_ptr file_writer(new WritableFileWriter( + NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); + { + log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); + std::string record; + new_db.EncodeTo(&record); + s = (*log_writer)->AddRecord(record); + for (const auto& e : new_cfs) { + record.clear(); + e.EncodeTo(&record); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + } + ASSERT_OK(s); + + cf_options_.table_factory = mock_table_factory_; + for (const auto& cf_name : cf_names) { + column_families->emplace_back(cf_name, cf_options_); + } + } + + // Create DB with 3 column families. 
+  void NewDB() {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    SequenceNumber last_seqno;
+    std::unique_ptr<log::Writer> log_writer;
+    SetIdentityFile(env_, dbname_);
+    PrepareManifest(&column_families, &last_seqno, &log_writer);
+    log_writer.reset();
+    // Make "CURRENT" file point to the new manifest file.
+    Status s = SetCurrentFile(env_, dbname_, 1, nullptr);
+    ASSERT_OK(s);
+
+    EXPECT_OK(versions_->Recover(column_families, false));
+    EXPECT_EQ(column_families.size(),
+              versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  }
+
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  const std::string dbname_;
+  EnvOptions env_options_;
+  ImmutableDBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  MutableCFOptions mutable_cf_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  WriteBufferManager write_buffer_manager_;
+  std::shared_ptr<VersionSet> versions_;
+  std::shared_ptr<ReactiveVersionSet> reactive_versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
+const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
+const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
+
+class VersionSetTest : public VersionSetTestBase, public testing::Test {
+ public:
+  VersionSetTest() : VersionSetTestBase() {}
+};
+
+TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
+  NewDB();
+  const int kGroupSize = 5;
+  autovector<VersionEdit> edits;
+  for (int i = 0; i != kGroupSize; ++i) {
+    edits.emplace_back(VersionEdit());
+  }
+  autovector<ColumnFamilyData*> cfds;
+  autovector<const MutableCFOptions*> all_mutable_cf_options;
+  autovector<autovector<VersionEdit*>> edit_lists;
+  for (int i = 0; i != kGroupSize; ++i) {
+    cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
+    all_mutable_cf_options.emplace_back(&mutable_cf_options_);
+    autovector<VersionEdit*> edit_list;
+    edit_list.emplace_back(&edits[i]);
+    edit_lists.emplace_back(edit_list);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  int count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
+        uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
+        EXPECT_EQ(0u, *cf_id);
+        ++count;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  mutex_.Lock();
+  Status s =
+      versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, &mutex_);
+  mutex_.Unlock();
+  EXPECT_OK(s);
+  EXPECT_EQ(kGroupSize - 1, count);
+}
+
+class VersionSetAtomicGroupTest : public VersionSetTestBase,
+                                  public testing::Test {
+ public:
+  VersionSetAtomicGroupTest() : VersionSetTestBase() {}
+
+  void SetUp() override {
+    PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+    SetupTestSyncPoints();
+  }
+
+  void SetupValidAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      edits_[i].MarkAtomicGroup(--remaining);
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+  }
+
+  void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      edits_[i].MarkAtomicGroup(--remaining);
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+  }
+
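The Setup*AtomicGroup helpers rely on the group-marking convention visible above: MarkAtomicGroup(--remaining) stores in each edit how many edits of the group are still to come, so the final edit carries zero. A standalone sketch of that convention (illustrative types, not the RocksDB API):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct SketchEdit {
      bool in_group = false;
      uint32_t remaining = 0;
    };

    // Each edit records how many group members follow it; the last one
    // records 0, which tells a reader the group is complete.
    std::vector<SketchEdit> MakeAtomicGroup(uint32_t size) {
      std::vector<SketchEdit> edits(size);
      uint32_t remaining = size;
      for (auto& e : edits) {
        e.in_group = true;
        e.remaining = --remaining;  // size-1, size-2, ..., 0
      }
      return edits;
    }

    int main() {
      auto group = MakeAtomicGroup(3);
      assert(group.front().remaining == 2);
      assert(group.back().remaining == 0);  // marks the end of the group
      return 0;
    }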
+  void SetupCorruptedAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      if (i != ((size_t)atomic_group_size / 2)) {
+        edits_[i].MarkAtomicGroup(--remaining);
+      }
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+  }
+
+  void SetupIncorrectAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      if (i != 1) {
+        edits_[i].MarkAtomicGroup(--remaining);
+      } else {
+        edits_[i].MarkAtomicGroup(remaining--);
+      }
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr));
+  }
+
+  void SetupTestSyncPoints() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
+          VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+          EXPECT_EQ(edits_.front().DebugString(),
+                    e->DebugString());  // compare based on value
+          first_in_atomic_group_ = true;
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
+          VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+          EXPECT_EQ(edits_.back().DebugString(),
+                    e->DebugString());  // compare based on value
+          EXPECT_TRUE(first_in_atomic_group_);
+          last_in_atomic_group_ = true;
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) {
+          num_recovered_edits_ = *reinterpret_cast<int*>(arg);
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "ReactiveVersionSet::ReadAndApply:AppliedEdits",
+        [&](void* arg) { num_applied_edits_ = *reinterpret_cast<int*>(arg); });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroup",
+        [&](void* /* arg */) { ++num_edits_in_atomic_group_; });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
+        [&](void* arg) {
+          corrupted_edit_ = *reinterpret_cast<VersionEdit*>(arg);
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
+        [&](void* arg) {
+          edit_with_incorrect_group_size_ =
+              *reinterpret_cast<VersionEdit*>(arg);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  void AddNewEditsToLog(int num_edits) {
+    for (int i = 0; i < num_edits; i++) {
+      std::string record;
+      edits_[i].EncodeTo(&record);
+      ASSERT_OK(log_writer_->AddRecord(record));
+    }
+  }
+
+  void TearDown() override {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    log_writer_.reset();
+  }
+
+ protected:
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  SequenceNumber last_seqno_;
+  std::vector<VersionEdit> edits_;
+  bool first_in_atomic_group_ = false;
+  bool last_in_atomic_group_ = false;
+  int num_edits_in_atomic_group_ = 0;
+  int num_recovered_edits_ = 0;
+  int num_applied_edits_ = 0;
+  VersionEdit corrupted_edit_;
+  VersionEdit edit_with_incorrect_group_size_;
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
+  const int kAtomicGroupSize = 3;
+  SetupValidAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  EXPECT_OK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. 
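The "replay buffer" the comment above refers to behaves like the following standalone sketch (illustrative types; the real buffer lives in AtomicGroupReadBuffer): edits of a group are held back until the edit marked remaining == 0 arrives, then the whole group is applied at once and the buffer cleared.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct BufferedEdit {
      uint32_t remaining;  // group members still expected after this edit
    };

    class ReplayBufferSketch {
     public:
      // Returns the number of edits applied (0 while the group is incomplete).
      size_t AddEdit(BufferedEdit e) {
        buffer_.push_back(e);
        if (e.remaining != 0) return 0;   // group not finished yet
        size_t applied = buffer_.size();  // apply the whole group at once
        buffer_.clear();
        return applied;
      }
      size_t buffered() const { return buffer_.size(); }

     private:
      std::vector<BufferedEdit> buffer_;
    };

    int main() {
      ReplayBufferSketch rb;
      assert(rb.AddEdit({2}) == 0 && rb.buffered() == 1);
      assert(rb.AddEdit({1}) == 0 && rb.buffered() == 2);
      assert(rb.AddEdit({0}) == 3 && rb.buffered() == 0);  // applied, cleared
      return 0;
    }

The assertions that follow check exactly that post-recovery state: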
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + // Write the last record. The reactive version set should now apply all + // edits. + std::string last_record; + edits_[kAtomicGroupSize - 1].EncodeTo(&last_record); + EXPECT_OK(log_writer_->AddRecord(last_record)); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + // Reactive version set should be empty now. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + // No edits in an atomic group. + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + // Write a few edits in an atomic group. 
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + // Write the corrupted edits. 
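"Corrupted" here means what SetupCorruptedAtomicGroup arranged earlier: the middle edit of the group lost its atomic-group marker, so a reader that is inside a group suddenly encounters a normal edit. A standalone sketch of such a well-formedness check (illustrative, not the RocksDB implementation):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct EditRec {
      bool in_group;
      uint32_t remaining;
    };

    // A valid group has every edit marked, with remaining counting down to 0.
    bool GroupIsWellFormed(const std::vector<EditRec>& edits) {
      uint32_t expected = static_cast<uint32_t>(edits.size());
      for (const auto& e : edits) {
        --expected;
        if (!e.in_group || e.remaining != expected) return false;  // corrupted
      }
      return true;
    }

    int main() {
      assert(GroupIsWellFormed({{true, 2}, {true, 1}, {true, 0}}));
      // Middle edit lost its marker, as in SetupCorruptedAtomicGroup:
      assert(!GroupIsWellFormed({{true, 2}, {false, 0}, {true, 0}}));
      return 0;
    }

The test now writes that damaged group and expects the bad edit to be surfaced: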
+ AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} + +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} + +class VersionSetTestDropOneCF : public VersionSetTestBase, + public testing::TestWithParam { + public: + VersionSetTestDropOneCF() : VersionSetTestBase() {} +}; + +// This test simulates the following execution sequence +// Time thread1 bg_flush_thr +// | Prepare version edits (e1,e2,e3) for atomic +// | flush cf1, cf2, cf3 +// | Enqueue e to drop cfi +// | to manifest_writers_ +// | Enqueue (e1,e2,e3) to manifest_writers_ +// | +// | Apply e, +// | cfi.IsDropped() is true +// | Apply (e1,e2,e3), +// | since cfi.IsDropped() == true, we need to +// | drop ei and write the rest to MANIFEST. +// V +// +// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and +// last column family in an atomic group. 
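A standalone sketch of the behavior the comment block above describes: when one column family in the atomic group has been dropped, its edit is filtered out and only the remaining edits are committed as a smaller group (illustrative types, not the RocksDB API):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct GroupEdit {
      uint32_t cf_id;
    };

    // Drop the dropped column family's edit; the rest still go to MANIFEST.
    std::vector<GroupEdit> FilterDropped(const std::vector<GroupEdit>& group,
                                         uint32_t dropped_cf) {
      std::vector<GroupEdit> out;
      for (const auto& e : group) {
        if (e.cf_id != dropped_cf) out.push_back(e);
      }
      return out;
    }

    int main() {
      std::vector<GroupEdit> group = {{1}, {2}, {3}};
      auto committed = FilterDropped(group, 2);
      assert(committed.size() == 2);  // matches kAtomicGroupSize - 1 below
      return 0;
    }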
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + PrepareManifest(&column_families, &last_seqno, &log_writer); + Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + + const int kAtomicGroupSize = 3; + const std::vector non_default_cf_names = { + kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3}; + + // Drop one column family + VersionEdit drop_cf_edit; + drop_cf_edit.DropColumnFamily(); + const std::string cf_to_drop_name(GetParam()); + auto cfd_to_drop = + versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name); + ASSERT_NE(nullptr, cfd_to_drop); + // Increase its refcount because cfd_to_drop is used later, and we need to + // prevent it from being deleted. + cfd_to_drop->Ref(); + drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); + mutex_.Lock(); + s = versions_->LogAndApply(cfd_to_drop, + *cfd_to_drop->GetLatestMutableCFOptions(), + &drop_cf_edit, &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + + std::vector edits(kAtomicGroupSize); + uint32_t remaining = kAtomicGroupSize; + size_t i = 0; + autovector cfds; + autovector mutable_cf_options_list; + autovector> edit_lists; + for (const auto& cf_name : non_default_cf_names) { + auto cfd = (cf_name != cf_to_drop_name) + ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name) + : cfd_to_drop; + ASSERT_NE(nullptr, cfd); + cfds.push_back(cfd); + mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions()); + edits[i].SetColumnFamily(cfd->GetID()); + edits[i].SetLogNumber(0); + edits[i].SetNextFile(2); + edits[i].MarkAtomicGroup(--remaining); + edits[i].SetLastSequence(last_seqno++); + autovector tmp_edits; + tmp_edits.push_back(&edits[i]); + edit_lists.emplace_back(tmp_edits); + ++i; + } + int called = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) { + std::vector* tmp_edits = + reinterpret_cast*>(arg); + EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size()); + for (const auto e : *tmp_edits) { + bool found = false; + for (const auto& e2 : edits) { + if (&e2 == e) { + found = true; + break; + } + } + ASSERT_TRUE(found); + } + ++called; + }); + SyncPoint::GetInstance()->EnableProcessing(); + mutex_.Lock(); + s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, + &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + ASSERT_EQ(1, called); + if (cfd_to_drop->Unref()) { + delete cfd_to_drop; + cfd_to_drop = nullptr; + } +} + +INSTANTIATE_TEST_CASE_P( + AtomicGroup, VersionSetTestDropOneCF, + testing::Values(VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3)); +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc new file mode 100644 index 000000000..5b699274c --- /dev/null +++ b/src/rocksdb/db/wal_manager.cc @@ -0,0 +1,510 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sequence_file_reader.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::DeleteFile(const std::string& fname, uint64_t number) {
+  auto s = env_->DeleteFile(db_options_.wal_dir + "/" + fname);
+  if (s.ok()) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.erase(number);
+  }
+  return s;
+}
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in the db dir, then get sorted files from the
+  // archived dir, to avoid a race condition where a log file is moved to
+  // the archived dir in between.
+  Status s;
+  // list wal files in main db dir.
+  VectorLogPtr logs;
+  s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reproduce the race condition where a log file is moved
+  // to archived dir, between these two sync points, used in
+  // (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+  files.clear();
+  // list wal files in archive dir.
+  std::string archivedir = ArchivalDirectory(db_options_.wal_dir);
+  Status exists = env_->FileExists(archivedir);
+  if (exists.ok()) {
+    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  } else if (!exists.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+
+  uint64_t latest_archived_log_number = 0;
+  if (!files.empty()) {
+    latest_archived_log_number = files.back()->LogNumber();
+    ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64,
+                   latest_archived_log_number);
+  }
+
+  files.reserve(files.size() + logs.size());
+  for (auto& log : logs) {
+    if (log->LogNumber() > latest_archived_log_number) {
+      files.push_back(std::move(log));
+    } else {
+      // When the race condition happens, we could see the
+      // same log in both db dir and archived dir. Simply
+      // ignore the one in db dir. Note that, if we read
+      // archived dir first, we would have missed the log file.
+      ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive",
+                     log->PathName().c_str());
+    }
+  }
+
+  return s;
+}
+
+Status WalManager::GetUpdatesSince(
+    SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options,
+    VersionSet* version_set) {
+
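RetainProbableWalFiles (defined later in this file) implements that search: WAL files are sorted by their first sequence number, and the file just before the first file starting after the target must also be kept, because the target sequence may fall inside it. A standalone sketch of the same search (hypothetical helper name):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Returns the index of the first WAL file that could contain `target`,
    // given the files' start sequence numbers in ascending order.
    size_t FirstProbableWalIndex(const std::vector<uint64_t>& start_seqs,
                                 uint64_t target) {
      int64_t lo = 0;
      int64_t hi = static_cast<int64_t>(start_seqs.size()) - 1;
      while (hi >= lo) {
        int64_t mid = lo + (hi - lo) / 2;  // avoid overflow
        uint64_t s = start_seqs[static_cast<size_t>(mid)];
        if (s == target) { hi = mid; break; }
        if (s < target) lo = mid + 1; else hi = mid - 1;
      }
      // hi can end up at -1 when target precedes every file; clamp to 0.
      return static_cast<size_t>(std::max<int64_t>(0, hi));
    }

    int main() {
      std::vector<uint64_t> starts = {1, 100, 200, 300};
      assert(FirstProbableWalIndex(starts, 150) == 1);  // 150 is inside file 1
      assert(FirstProbableWalIndex(starts, 200) == 2);
      assert(FirstProbableWalIndex(starts, 0) == 0);
      return 0;
    }

+  // Get all sorted WAL files.
+  // Do a binary search over them and open only the files needed to find the
+  // requested sequence number.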
+ + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset(new TransactionLogIteratorImpl( + db_options_.wal_dir, &db_options_, read_options, file_options_, seq, + std::move(wal_files), version_set, seq_per_batch_)); + return (*iter)->status(); +} + +// 1. Go through all archived files and +// a. if ttl is enabled, delete outdated files +// b. if archive size limit is enabled, delete empty files, +// compute file number and size. +// 2. If size limit is enabled: +// a. compute how many files should be deleted +// b. get sorted non-empty archived logs +// c. delete what should be deleted +void WalManager::PurgeObsoleteWALFiles() { + bool const ttl_enabled = db_options_.wal_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0; + if (!ttl_enabled && !size_limit_enabled) { + return; + } + + int64_t current_time; + Status s = env_->GetCurrentTime(¤t_time); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s", + s.ToString().c_str()); + assert(false); + return; + } + uint64_t const now_seconds = static_cast(current_time); + uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) + ? db_options_.wal_ttl_seconds / 2 + : kDefaultIntervalToDeleteObsoleteWAL; + + if (purge_wal_files_last_run_ + time_to_check > now_seconds) { + return; + } + + purge_wal_files_last_run_ = now_seconds; + + std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); + std::vector files; + s = env_->GetChildren(archival_dir, &files); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s", + s.ToString().c_str()); + assert(false); + return; + } + + size_t log_files_num = 0; + uint64_t log_file_size = 0; + + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = archival_dir + "/" + f; + if (ttl_enabled) { + uint64_t file_m_time; + s = env_->GetFileModificationTime(file_path, &file_m_time); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Can't get file mod time: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } + if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + continue; + } + } + + if (size_limit_enabled) { + uint64_t file_size; + s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Unable to get file size: %s: %s", file_path.c_str(), + s.ToString().c_str()); + return; + } else { + if (file_size > 0) { + log_file_size = std::max(log_file_size, file_size); + ++log_files_num; + } else { + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to delete file: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + } + } + } + } + } + + if (0 == log_files_num || !size_limit_enabled) { + return; + } + 
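The computation just below approximates how many archived files fit in the budget as (size limit in bytes) / (largest observed file size), then deletes the oldest surplus files. The same arithmetic as a standalone sketch (hypothetical helper name):

    #include <cassert>
    #include <cstdint>

    // Number of archived WAL files to delete under the size limit policy.
    uint64_t FilesToDelete(uint64_t num_files, uint64_t largest_file_bytes,
                           uint64_t size_limit_mb) {
      uint64_t keep = size_limit_mb * 1024 * 1024 / largest_file_bytes;
      return num_files > keep ? num_files - keep : 0;
    }

    int main() {
      // 20 files of ~1 MB each with an 8 MB budget -> delete the 12 oldest.
      assert(FilesToDelete(20, 1024 * 1024, 8) == 12);
      return 0;
    }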
+ size_t const files_keep_num = + static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); + if (log_files_num <= files_keep_num) { + return; + } + + size_t files_del_num = log_files_num - files_keep_num; + VectorLogPtr archived_logs; + GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + + if (files_del_num > archived_logs.size()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Trying to delete more archived log files than " + "exist. Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, + db_options_.wal_dir, false, + /*force_fg=*/!wal_in_db_path_); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(archived_logs[i]->LogNumber()); + } + } +} + +void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { + auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); + Status s = env_->RenameFile(fname, archived_log_name); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2"); + ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n", + fname.c_str(), archived_log_name.c_str(), + s.ToString().c_str()); +} + +Status WalManager::GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(all_files.size()); + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + SequenceNumber sequence; + Status s = ReadFirstRecord(log_type, number, &sequence); + if (!s.ok()) { + return s; + } + if (sequence == 0) { + // empty file + continue; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2"); + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + // re-try in case the alive log file has been moved to archive. + if (!s.ok() && log_type == kAliveLogFile) { + std::string archived_file = ArchivedLogFileName(path, number); + if (env_->FileExists(archived_file).ok()) { + s = env_->GetFileSize(archived_file, &size_bytes); + if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { + // oops, the file just got deleted from archived dir! 
move on.
+            s = Status::OK();
+            continue;
+          }
+        }
+      }
+      if (!s.ok()) {
+        return s;
+      }
+
+      log_files.push_back(std::unique_ptr<LogFile>(
+          new LogFileImpl(number, log_type, sequence, size_bytes)));
+    }
+  }
+  std::sort(
+      log_files.begin(), log_files.end(),
+      [](const std::unique_ptr<LogFile>& a, const std::unique_ptr<LogFile>& b) {
+        LogFileImpl* a_impl =
+            static_cast_with_check<LogFileImpl, LogFile>(a.get());
+        LogFileImpl* b_impl =
+            static_cast_with_check<LogFileImpl, LogFile>(b.get());
+        return *a_impl < *b_impl;
+      });
+  return status;
+}
+
+Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                          const SequenceNumber target) {
+  int64_t start = 0;  // signed to avoid overflow when target is < first file.
+  int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+  // Binary search; avoids opening all the files.
+  while (end >= start) {
+    int64_t mid = start + (end - start) / 2;  // Avoid overflow.
+    SequenceNumber current_seq_num =
+        all_logs.at(static_cast<size_t>(mid))->StartSequence();
+    if (current_seq_num == target) {
+      end = mid;
+      break;
+    } else if (current_seq_num < target) {
+      start = mid + 1;
+    } else {
+      end = mid - 1;
+    }
+  }
+  // end could be negative.
+  size_t start_index =
+      static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
+  // The last WAL file is always included.
+  all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+  return Status::OK();
+}
+
+Status WalManager::ReadFirstRecord(const WalFileType type,
+                                   const uint64_t number,
+                                   SequenceNumber* sequence) {
+  *sequence = 0;
+  if (type != kAliveLogFile && type != kArchivedLogFile) {
+    ROCKS_LOG_ERROR(db_options_.info_log, "[WalManager] Unknown file type %s",
+                    ToString(type).c_str());
+    return Status::NotSupported("File Type Not Known " + ToString(type));
+  }
+  {
+    MutexLock l(&read_first_record_cache_mutex_);
+    auto itr = read_first_record_cache_.find(number);
+    if (itr != read_first_record_cache_.end()) {
+      *sequence = itr->second;
+      return Status::OK();
+    }
+  }
+  Status s;
+  if (type == kAliveLogFile) {
+    std::string fname = LogFileName(db_options_.wal_dir, number);
+    s = ReadFirstLine(fname, number, sequence);
+    if (!s.ok() && env_->FileExists(fname).ok()) {
+      // Return any error that is not caused by a non-existing file.
+      return s;
+    }
+  }
+
+  if (type == kArchivedLogFile || !s.ok()) {
+    // Check if the file got moved to the archive.
+    std::string archived_file =
+        ArchivedLogFileName(db_options_.wal_dir, number);
+    s = ReadFirstLine(archived_file, number, sequence);
+    // Maybe the file was deleted from the archive dir. If that's the case,
+    // return Status::OK(). The caller will identify this as an empty file
+    // because *sequence == 0.
+    if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+      return Status::OK();
+    }
+  }
+
+  if (s.ok() && *sequence != 0) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.insert({number, *sequence});
+  }
+  return s;
+}
+
+Status WalManager::GetLiveWalFile(uint64_t number,
+                                  std::unique_ptr<LogFile>* log_file) {
+  if (!log_file) {
+    return Status::InvalidArgument("log_file not preallocated.");
+  }
+
+  if (!number) {
+    return Status::PathNotFound("log file not available");
+  }
+
+  Status s;
+
+  uint64_t size_bytes;
+  s = env_->GetFileSize(LogFileName(db_options_.wal_dir, number), &size_bytes);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  log_file->reset(new LogFileImpl(number, kAliveLogFile,
+                                  0,  // SequenceNumber
+                                  size_bytes));
+
+  return Status::OK();
+}
+
+// The function returns status.ok() and *sequence == 0 if the file exists but
+// is empty.
+Status WalManager::ReadFirstLine(const std::string& fname,
+                                 const uint64_t number,
+                                 SequenceNumber* sequence) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+
+    Status* status;
+    bool ignore_error;  // true if db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s",
+                     (this->ignore_error ? "(ignoring error) " : ""), fname,
+                     static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status->ok()) {
+        // only keep the first error
+        *this->status = s;
+      }
+    }
+  };
+
+  std::unique_ptr<FSSequentialFile> file;
+  Status status = fs_->NewSequentialFile(
+      fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+  std::unique_ptr<SequentialFileReader> file_reader(
+      new SequentialFileReader(std::move(file), fname));
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = db_options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = &status;
+  reporter.ignore_error = !db_options_.paranoid_checks;
+  log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
+                     true /*checksum*/, number);
+  std::string scratch;
+  Slice record;
+
+  if (reader.ReadRecord(&record, &scratch) &&
+      (status.ok() || !db_options_.paranoid_checks)) {
+    if (record.size() < WriteBatchInternal::kHeader) {
+      reporter.Corruption(record.size(),
+                          Status::Corruption("log record too small"));
+      // TODO: read records until the first non-corrupt entry?
+    } else {
+      WriteBatch batch;
+      WriteBatchInternal::SetContents(&batch, record);
+      *sequence = WriteBatchInternal::Sequence(&batch);
+      return Status::OK();
+    }
+  }
+
+  // ReadRecord returns false on EOF, which means the log file is empty. We
+  // return status.ok() in that case and set the sequence number to 0.
+  *sequence = 0;
+  return status;
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
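ReadFirstLine above rejects any first record shorter than WriteBatchInternal::kHeader before extracting the batch sequence. The serialized write batch begins with a fixed 12-byte header, an 8-byte little-endian sequence number followed by a 4-byte entry count; a standalone sketch of decoding it (hypothetical constant and helper names, little-endian host assumed):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <string>

    // Illustrative stand-in for WriteBatchInternal::kHeader:
    // 8-byte little-endian sequence number + 4-byte entry count.
    constexpr std::size_t kBatchHeaderBytes = 12;

    // Decodes the leading sequence number of a serialized write batch.
    uint64_t DecodeFirstSequence(const std::string& record) {
      assert(record.size() >= kBatchHeaderBytes);
      uint64_t seq = 0;
      std::memcpy(&seq, record.data(), sizeof(seq));
      return seq;
    }

    int main() {
      std::string record(kBatchHeaderBytes, '\0');
      uint64_t seq = 42;
      std::memcpy(&record[0], &seq, sizeof(seq));  // write the header's seq
      assert(DecodeFirstSequence(record) == 42);
      return 0;
    }

diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 000000000..783bfe99c
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be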
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// WAL manager provides the abstraction for reading the WAL files as a single
+// unit. Internally, it opens and reads the files using Reader or Writer
+// abstraction.
+class WalManager {
+ public:
+  WalManager(const ImmutableDBOptions& db_options,
+             const FileOptions& file_options, const bool seq_per_batch = false)
+      : db_options_(db_options),
+        file_options_(file_options),
+        env_(db_options.env),
+        fs_(db_options.fs.get()),
+        purge_wal_files_last_run_(0),
+        seq_per_batch_(seq_per_batch),
+        wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {}
+
+  Status GetSortedWalFiles(VectorLogPtr& files);
+
+  // Allow the user to tail the transaction log and find all recent changes
+  // to the database that are newer than `seq_number`.
+  Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options,
+      VersionSet* version_set);
+
+  void PurgeObsoleteWALFiles();
+
+  void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+  Status DeleteFile(const std::string& fname, uint64_t number);
+
+  Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+                              SequenceNumber* sequence) {
+    return ReadFirstRecord(type, number, sequence);
+  }
+
+  Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+                            SequenceNumber* sequence) {
+    return ReadFirstLine(fname, number, sequence);
+  }
+
+ private:
+  Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+                             WalFileType type);
+  // Requires: all_logs should be sorted with the earliest log file first.
+  // Retains all log files in all_logs which contain updates with seq no.
+  // greater than or equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         SequenceNumber* sequence);
+
+  Status ReadFirstLine(const std::string& fname, const uint64_t number,
+                       SequenceNumber* sequence);
+
+  // ------- state from DBImpl ------
+  const ImmutableDBOptions& db_options_;
+  const FileOptions file_options_;
+  Env* env_;
+  FileSystem* fs_;
+
+  // ------- WalManager state -------
+  // cache for ReadFirstRecord() calls
+  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+  port::Mutex read_first_record_cache_mutex_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  bool seq_per_batch_;
+
+  bool wal_in_db_path_;
+
+  // Obsolete files will be deleted every this many seconds if TTL deletion
+  // is enabled and the archive size limit is disabled.
+  static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+};
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 000000000..26bad368e
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/log_writer.h" +#include "db/version_set.h" +#include "db/wal_manager.h" +#include "env/mock_env.h" +#include "file/writable_file_writer.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO(icanadi) mock out VersionSet +// TODO(icanadi) move other WalManager-specific tests from db_test here +class WalManagerTest : public testing::Test { + public: + WalManagerTest() + : env_(new MockEnv(Env::Default())), + dbname_(test::PerThreadDBPath("wal_manager_test")), + db_options_(), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + current_log_number_(0) { + DestroyDB(dbname_, Options()); + } + + void Init() { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_))); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.wal_dir = dbname_; + db_options_.env = env_.get(); + fs_.reset(new LegacyFileSystemWrapper(env_.get())); + db_options_.fs = fs_; + + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_, + /*block_cache_tracer=*/nullptr)); + + wal_manager_.reset(new WalManager(db_options_, env_options_)); + } + + void Reopen() { + wal_manager_.reset(new WalManager(db_options_, env_options_)); + } + + // NOT thread safe + void Put(const std::string& key, const std::string& value) { + assert(current_log_writer_.get() != nullptr); + uint64_t seq = versions_->LastSequence() + 1; + WriteBatch batch; + batch.Put(key, value); + WriteBatchInternal::SetSequence(&batch, seq); + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); + versions_->SetLastAllocatedSequence(seq); + versions_->SetLastPublishedSequence(seq); + versions_->SetLastSequence(seq); + } + + // NOT thread safe + void RollTheLog(bool /*archived*/) { + current_log_number_++; + std::string fname = ArchivedLogFileName(dbname_, current_log_number_); + std::unique_ptr file; + ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); + std::unique_ptr file_writer(new WritableFileWriter( + NewLegacyWritableFileWrapper(std::move(file)), fname, env_options_)); + current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); + } + + void CreateArchiveLogs(int num_logs, int entries_per_log) { + for (int i = 1; i <= num_logs; ++i) { + RollTheLog(true); + for (int k = 0; k < entries_per_log; ++k) { + Put(ToString(k), std::string(1024, 'a')); + } + } + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + std::unique_ptr iter; + Status status = wal_manager_->GetUpdatesSince( + seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get()); + EXPECT_OK(status); + return iter; + } + + std::unique_ptr env_; + std::string dbname_; + ImmutableDBOptions db_options_; + WriteController write_controller_; + EnvOptions env_options_; + std::shared_ptr table_cache_; + WriteBufferManager write_buffer_manager_; + std::unique_ptr versions_; + 
+  std::unique_ptr<WalManager> wal_manager_;
+  std::shared_ptr<LegacyFileSystemWrapper> fs_;
+
+  std::unique_ptr<log::Writer> current_log_writer_;
+  uint64_t current_log_number_;
+};
+
+TEST_F(WalManagerTest, ReadFirstRecordCache) {
+  Init();
+  std::string path = dbname_ + "/000001.log";
+  std::unique_ptr<WritableFile> file;
+  ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions()));
+
+  SequenceNumber s;
+  ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s));
+  ASSERT_EQ(s, 0U);
+
+  ASSERT_OK(
+      wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s));
+  ASSERT_EQ(s, 0U);
+
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      NewLegacyWritableFileWrapper(std::move(file)), path, EnvOptions()));
+  log::Writer writer(std::move(file_writer), 1,
+                     db_options_.recycle_log_file_num > 0);
+  WriteBatch batch;
+  batch.Put("foo", "bar");
+  WriteBatchInternal::SetSequence(&batch, 10);
+  writer.AddRecord(WriteBatchInternal::Contents(&batch));
+
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here.
+  // Waiting for lei to finish with db_test
+  // env_->count_sequential_reads_ = true;
+  // sequential_read_counter_ sanity test
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+  ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // did a read
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+  ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // no new reads since the value is cached
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, Env* env) {
+  uint64_t dir_size = 0;
+  std::vector<std::string> files;
+  env->GetChildren(dir_path, &files);
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kLogFile) {
+      std::string const file_path = dir_path + "/" + f;
+      uint64_t file_size;
+      env->GetFileSize(file_path, &file_size);
+      dir_size += file_size;
+    }
+  }
+  return dir_size;
+}
+std::vector<uint64_t> ListSpecificFiles(
+    Env* env, const std::string& path, const FileType expected_file_type) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> file_numbers;
+  env->GetChildren(path, &files);
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == expected_file_type) {
+        file_numbers.push_back(number);
+      }
+    }
+  }
+  return file_numbers;
+}
+
+int CountRecords(TransactionLogIterator* iter) {
+  int count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    EXPECT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    EXPECT_OK(iter->status());
+    iter->Next();
+  }
+  return count;
+}
+}  // namespace
+
+TEST_F(WalManagerTest, WALArchivalSizeLimit) {
+  db_options_.wal_ttl_seconds = 0;
+  db_options_.wal_size_limit_mb = 1000;
+  Init();
+
+  // TEST : Create WalManager with huge size limit and no ttl.
+  // Create some archived files and call PurgeObsoleteWALFiles().
+  // Count the archived log files that survived.
+  // Assert that all of them did.
+  // Change size limit. Re-open WalManager.
+  // Assert that archive is not greater than wal_size_limit_mb after
+  // PurgeObsoleteWALFiles().
+  // Set ttl and time_to_check_ to small values. Re-open db.
+  // Assert that there are no archived logs left.
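+  // Rough arithmetic for this test (assuming roughly 1KB per entry, since
+  // CreateArchiveLogs writes 1024-byte values): 20 logs x 5000 entries x
+  // ~1KB is on the order of 100MB of archived WAL, far above the 8MB limit
+  // set below, so the purge must drop most of the archive.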
+
+  std::string archive_dir = ArchivalDirectory(dbname_);
+  CreateArchiveLogs(20, 5000);
+
+  std::vector<uint64_t> log_files =
+      ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_EQ(log_files.size(), 20U);
+
+  db_options_.wal_size_limit_mb = 8;
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
+  ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024);
+
+  db_options_.wal_ttl_seconds = 1;
+  env_->FakeSleepForMicroseconds(2 * 1000 * 1000);
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, WALArchivalTtl) {
+  db_options_.wal_ttl_seconds = 1000;
+  Init();
+
+  // TEST : Create WalManager with a ttl and no size limit.
+  // Create some archived log files and call PurgeObsoleteWALFiles().
+  // Assert that files are not deleted.
+  // Reopen db with small ttl.
+  // Assert that all archived logs were removed.
+
+  std::string archive_dir = ArchivalDirectory(dbname_);
+  CreateArchiveLogs(20, 5000);
+
+  std::vector<uint64_t> log_files =
+      ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_GT(log_files.size(), 0U);
+
+  db_options_.wal_ttl_seconds = 1;
+  env_->FakeSleepForMicroseconds(3 * 1000 * 1000);
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile);
+  ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
+  Init();
+  RollTheLog(false);
+  Put("key1", std::string(1024, 'a'));
+  // Create a zero record WAL file.
+  RollTheLog(false);
+  RollTheLog(false);
+
+  Put("key2", std::string(1024, 'a'));
+
+  auto iter = OpenTransactionLogIter(0);
+  ASSERT_EQ(2, CountRecords(iter.get()));
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
+  Init();
+  RollTheLog(false);
+  auto iter = OpenTransactionLogIter(0);
+  // Check that an empty iterator is returned.
+  ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+  Init();
+  CreateArchiveLogs(2, 100);
+  auto iter = OpenTransactionLogIter(0);
+  CreateArchiveLogs(1, 100);
+  int i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 200);
+  // A new log file was added after the iterator was created.
+  // TryAgain indicates a new iterator is needed to fetch the new data.
+  ASSERT_TRUE(iter->status().IsTryAgain());
+
+  iter = OpenTransactionLogIter(0);
+  i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 300);
+  ASSERT_TRUE(iter->status().ok());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
new file mode 100644
index 000000000..d578db59b
--- /dev/null
+++ b/src/rocksdb/db/write_batch.cc
@@ -0,0 +1,2092 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring
+//    kTypeDeletion varstring
+//    kTypeSingleDeletion varstring
+//    kTypeRangeDeletion varstring varstring
+//    kTypeMerge varstring varstring
+//    kTypeColumnFamilyValue varint32 varstring varstring
+//    kTypeColumnFamilyDeletion varint32 varstring
+//    kTypeColumnFamilySingleDeletion varint32 varstring
+//    kTypeColumnFamilyRangeDeletion varint32 varstring varstring
+//    kTypeColumnFamilyMerge varint32 varstring varstring
+//    kTypeBeginPrepareXID varstring
+//    kTypeEndPrepareXID
+//    kTypeCommitXID varstring
+//    kTypeRollbackXID varstring
+//    kTypeBeginPersistedPrepareXID varstring
+//    kTypeBeginUnprepareXID varstring
+//    kTypeNoop
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "rocksdb/write_batch.h"
+
+#include <map>
+#include <stack>
+#include <stdexcept>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_batch_internal.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/merge_operator.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/duplicate_detector.h"
+#include "util/string_util.h"
+#include "util/util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// anon namespace for file-local types
+namespace {
+
+enum ContentFlags : uint32_t {
+  DEFERRED = 1 << 0,
+  HAS_PUT = 1 << 1,
+  HAS_DELETE = 1 << 2,
+  HAS_SINGLE_DELETE = 1 << 3,
+  HAS_MERGE = 1 << 4,
+  HAS_BEGIN_PREPARE = 1 << 5,
+  HAS_END_PREPARE = 1 << 6,
+  HAS_COMMIT = 1 << 7,
+  HAS_ROLLBACK = 1 << 8,
+  HAS_DELETE_RANGE = 1 << 9,
+  HAS_BLOB_INDEX = 1 << 10,
+  HAS_BEGIN_UNPREPARE = 1 << 11,
+};
+
+struct BatchContentClassifier : public WriteBatch::Handler {
+  uint32_t content_flags = 0;
+
+  Status PutCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_PUT;
+    return Status::OK();
+  }
+
+  Status DeleteCF(uint32_t, const Slice&) override {
+    content_flags |= ContentFlags::HAS_DELETE;
+    return Status::OK();
+  }
+
+  Status SingleDeleteCF(uint32_t, const Slice&) override {
+    content_flags |= ContentFlags::HAS_SINGLE_DELETE;
+    return Status::OK();
+  }
+
+  Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_DELETE_RANGE;
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_MERGE;
+    return Status::OK();
+  }
+
+  Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_BLOB_INDEX;
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool unprepare) override {
+    content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
+    if (unprepare) {
+      content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
+    }
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice&) override {
+    content_flags |= ContentFlags::HAS_END_PREPARE;
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice&) override {
+    content_flags |= ContentFlags::HAS_COMMIT;
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override {
content_flags |= ContentFlags::HAS_ROLLBACK; + return Status::OK(); + } +}; + +class TimestampAssigner : public WriteBatch::Handler { + public: + explicit TimestampAssigner(const Slice& ts) + : timestamp_(ts), timestamps_(kEmptyTimestampList) {} + explicit TimestampAssigner(const std::vector& ts_list) + : timestamps_(ts_list) { + SanityCheck(); + } + ~TimestampAssigner() override {} + + Status PutCF(uint32_t, const Slice& key, const Slice&) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status DeleteCF(uint32_t, const Slice& key) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t, const Slice& key) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status DeleteRangeCF(uint32_t, const Slice& begin_key, + const Slice& end_key) override { + AssignTimestamp(begin_key); + AssignTimestamp(end_key); + ++idx_; + return Status::OK(); + } + + Status MergeCF(uint32_t, const Slice& key, const Slice&) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override { + // TODO (yanqin): support blob db in the future. + return Status::OK(); + } + + Status MarkBeginPrepare(bool) override { + // TODO (yanqin): support in the future. + return Status::OK(); + } + + Status MarkEndPrepare(const Slice&) override { + // TODO (yanqin): support in the future. + return Status::OK(); + } + + Status MarkCommit(const Slice&) override { + // TODO (yanqin): support in the future. + return Status::OK(); + } + + Status MarkRollback(const Slice&) override { + // TODO (yanqin): support in the future. + return Status::OK(); + } + + private: + void SanityCheck() const { + assert(!timestamps_.empty()); +#ifndef NDEBUG + const size_t ts_sz = timestamps_[0].size(); + for (size_t i = 1; i != timestamps_.size(); ++i) { + assert(ts_sz == timestamps_[i].size()); + } +#endif // !NDEBUG + } + + void AssignTimestamp(const Slice& key) { + assert(timestamps_.empty() || idx_ < timestamps_.size()); + const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_]; + size_t ts_sz = ts.size(); + char* ptr = const_cast(key.data() + key.size() - ts_sz); + memcpy(ptr, ts.data(), ts_sz); + } + + static const std::vector kEmptyTimestampList; + const Slice timestamp_; + const std::vector& timestamps_; + size_t idx_ = 0; + + // No copy or move. + TimestampAssigner(const TimestampAssigner&) = delete; + TimestampAssigner(TimestampAssigner&&) = delete; + TimestampAssigner& operator=(const TimestampAssigner&) = delete; + TimestampAssigner&& operator=(TimestampAssigner&&) = delete; +}; +const std::vector TimestampAssigner::kEmptyTimestampList; + +} // anon namespace + +struct SavePoints { + std::stack> stack; +}; + +WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes) + : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) { + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) + ? reserved_bytes + : WriteBatchInternal::kHeader); + rep_.resize(WriteBatchInternal::kHeader); +} + +WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz) + : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) { + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ? 
+ reserved_bytes : WriteBatchInternal::kHeader); + rep_.resize(WriteBatchInternal::kHeader); +} + +WriteBatch::WriteBatch(const std::string& rep) + : content_flags_(ContentFlags::DEFERRED), + max_bytes_(0), + rep_(rep), + timestamp_size_(0) {} + +WriteBatch::WriteBatch(std::string&& rep) + : content_flags_(ContentFlags::DEFERRED), + max_bytes_(0), + rep_(std::move(rep)), + timestamp_size_(0) {} + +WriteBatch::WriteBatch(const WriteBatch& src) + : wal_term_point_(src.wal_term_point_), + content_flags_(src.content_flags_.load(std::memory_order_relaxed)), + max_bytes_(src.max_bytes_), + rep_(src.rep_), + timestamp_size_(src.timestamp_size_) { + if (src.save_points_ != nullptr) { + save_points_.reset(new SavePoints()); + save_points_->stack = src.save_points_->stack; + } +} + +WriteBatch::WriteBatch(WriteBatch&& src) noexcept + : save_points_(std::move(src.save_points_)), + wal_term_point_(std::move(src.wal_term_point_)), + content_flags_(src.content_flags_.load(std::memory_order_relaxed)), + max_bytes_(src.max_bytes_), + rep_(std::move(src.rep_)), + timestamp_size_(src.timestamp_size_) {} + +WriteBatch& WriteBatch::operator=(const WriteBatch& src) { + if (&src != this) { + this->~WriteBatch(); + new (this) WriteBatch(src); + } + return *this; +} + +WriteBatch& WriteBatch::operator=(WriteBatch&& src) { + if (&src != this) { + this->~WriteBatch(); + new (this) WriteBatch(std::move(src)); + } + return *this; +} + +WriteBatch::~WriteBatch() { } + +WriteBatch::Handler::~Handler() { } + +void WriteBatch::Handler::LogData(const Slice& /*blob*/) { + // If the user has not specified something to do with blobs, then we ignore + // them. +} + +bool WriteBatch::Handler::Continue() { + return true; +} + +void WriteBatch::Clear() { + rep_.clear(); + rep_.resize(WriteBatchInternal::kHeader); + + content_flags_.store(0, std::memory_order_relaxed); + + if (save_points_ != nullptr) { + while (!save_points_->stack.empty()) { + save_points_->stack.pop(); + } + } + + wal_term_point_.clear(); +} + +uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); } + +uint32_t WriteBatch::ComputeContentFlags() const { + auto rv = content_flags_.load(std::memory_order_relaxed); + if ((rv & ContentFlags::DEFERRED) != 0) { + BatchContentClassifier classifier; + Iterate(&classifier); + rv = classifier.content_flags; + + // this method is conceptually const, because it is performing a lazy + // computation that doesn't affect the abstract state of the batch. 
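+  // (Illustrative note: a batch reconstructed from a serialized rep string
+  // starts out with only the DEFERRED bit set, so the first query such as
+  // HasPut() pays for one pass over the records here, and later queries
+  // reuse the cached flags.)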
+ // content_flags_ is marked mutable so that we can perform the + // following assignment + content_flags_.store(rv, std::memory_order_relaxed); + } + return rv; +} + +void WriteBatch::MarkWalTerminationPoint() { + wal_term_point_.size = GetDataSize(); + wal_term_point_.count = Count(); + wal_term_point_.content_flags = content_flags_; +} + +bool WriteBatch::HasPut() const { + return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; +} + +bool WriteBatch::HasDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0; +} + +bool WriteBatch::HasSingleDelete() const { + return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0; +} + +bool WriteBatch::HasDeleteRange() const { + return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0; +} + +bool WriteBatch::HasMerge() const { + return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0; +} + +bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) { + assert(input != nullptr && key != nullptr); + // Skip tag byte + input->remove_prefix(1); + + if (cf_record) { + // Skip column_family bytes + uint32_t cf; + if (!GetVarint32(input, &cf)) { + return false; + } + } + + // Extract key + return GetLengthPrefixedSlice(input, key); +} + +bool WriteBatch::HasBeginPrepare() const { + return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0; +} + +bool WriteBatch::HasEndPrepare() const { + return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0; +} + +bool WriteBatch::HasCommit() const { + return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0; +} + +bool WriteBatch::HasRollback() const { + return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0; +} + +Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob, Slice* xid) { + assert(key != nullptr && value != nullptr); + *tag = (*input)[0]; + input->remove_prefix(1); + *column_family = 0; // default + switch (*tag) { + case kTypeColumnFamilyValue: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Put"); + } + FALLTHROUGH_INTENDED; + case kTypeValue: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch Put"); + } + break; + case kTypeColumnFamilyDeletion: + case kTypeColumnFamilySingleDeletion: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Delete"); + } + FALLTHROUGH_INTENDED; + case kTypeDeletion: + case kTypeSingleDeletion: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad WriteBatch Delete"); + } + break; + case kTypeColumnFamilyRangeDeletion: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch DeleteRange"); + } + FALLTHROUGH_INTENDED; + case kTypeRangeDeletion: + // for range delete, "key" is begin_key, "value" is end_key + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch DeleteRange"); + } + break; + case kTypeColumnFamilyMerge: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch Merge"); + } + FALLTHROUGH_INTENDED; + case kTypeMerge: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch Merge"); + } + break; + case kTypeColumnFamilyBlobIndex: + if (!GetVarint32(input, column_family)) { + return 
Status::Corruption("bad WriteBatch BlobIndex"); + } + FALLTHROUGH_INTENDED; + case kTypeBlobIndex: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch BlobIndex"); + } + break; + case kTypeLogData: + assert(blob != nullptr); + if (!GetLengthPrefixedSlice(input, blob)) { + return Status::Corruption("bad WriteBatch Blob"); + } + break; + case kTypeNoop: + case kTypeBeginPrepareXID: + // This indicates that the prepared batch is also persisted in the db. + // This is used in WritePreparedTxn + case kTypeBeginPersistedPrepareXID: + // This is used in WriteUnpreparedTxn + case kTypeBeginUnprepareXID: + break; + case kTypeEndPrepareXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad EndPrepare XID"); + } + break; + case kTypeCommitXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Commit XID"); + } + break; + case kTypeRollbackXID: + if (!GetLengthPrefixedSlice(input, xid)) { + return Status::Corruption("bad Rollback XID"); + } + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + return Status::OK(); +} + +Status WriteBatch::Iterate(Handler* handler) const { + if (rep_.size() < WriteBatchInternal::kHeader) { + return Status::Corruption("malformed WriteBatch (too small)"); + } + + return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader, + rep_.size()); +} + +Status WriteBatchInternal::Iterate(const WriteBatch* wb, + WriteBatch::Handler* handler, size_t begin, + size_t end) { + if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) { + return Status::Corruption("Invalid start/end bounds for Iterate"); + } + assert(begin <= end); + Slice input(wb->rep_.data() + begin, static_cast(end - begin)); + bool whole_batch = + (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); + + Slice key, value, blob, xid; + // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as + // the batch boundary symbols otherwise we would mis-count the number of + // batches. We do that by checking whether the accumulated batch is empty + // before seeing the next Noop. 
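+  // For intuition (not an exhaustive description): a manually constructed
+  // prepared batch is laid out as [Noop, mutations..., EndPrepare(xid)], and
+  // MarkEndPrepare() rewrites the leading Noop into a begin-prepare marker,
+  // so a Noop seen here while the accumulated batch is still empty is
+  // treated as padding rather than as a sub-batch boundary.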
+ bool empty_batch = true; + uint32_t found = 0; + Status s; + char tag = 0; + uint32_t column_family = 0; // default + bool last_was_try_again = false; + bool handler_continue = true; + while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) { + handler_continue = handler->Continue(); + if (!handler_continue) { + break; + } + + if (LIKELY(!s.IsTryAgain())) { + last_was_try_again = false; + tag = 0; + column_family = 0; // default + + s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, + &blob, &xid); + if (!s.ok()) { + return s; + } + } else { + assert(s.IsTryAgain()); + assert(!last_was_try_again); // to detect infinite loop bugs + if (UNLIKELY(last_was_try_again)) { + return Status::Corruption( + "two consecutive TryAgain in WriteBatch handler; this is either a " + "software bug or data corruption."); + } + last_was_try_again = true; + s = Status::OK(); + } + + switch (tag) { + case kTypeColumnFamilyValue: + case kTypeValue: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); + s = handler->PutCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyDeletion: + case kTypeDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); + s = handler->DeleteCF(column_family, key); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilySingleDeletion: + case kTypeSingleDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); + s = handler->SingleDeleteCF(column_family, key); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyRangeDeletion: + case kTypeRangeDeletion: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); + s = handler->DeleteRangeCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyMerge: + case kTypeMerge: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); + s = handler->MergeCF(column_family, key, value); + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } + break; + case kTypeColumnFamilyBlobIndex: + case kTypeBlobIndex: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); + s = handler->PutBlobIndexCF(column_family, key, value); + if (LIKELY(s.ok())) { + found++; + } + break; + case kTypeLogData: + handler->LogData(blob); + // A batch might have nothing but LogData. It is still a batch. + empty_batch = false; + break; + case kTypeBeginPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + handler->MarkBeginPrepare(); + empty_batch = false; + if (!handler->WriteAfterCommit()) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_after_commit_ is disabled (in " + "WritePrepared/WriteUnprepared mode). If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + if (handler->WriteBeforePrepare()) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_before_prepare_ is enabled " + "(in WriteUnprepared mode). 
If it is not due to corruption, the " + "WAL must be emptied before changing the WritePolicy."); + } + break; + case kTypeBeginPersistedPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + handler->MarkBeginPrepare(); + empty_batch = false; + if (handler->WriteAfterCommit()) { + s = Status::NotSupported( + "WritePrepared/WriteUnprepared txn tag when write_after_commit_ " + "is enabled (in default WriteCommitted mode). If it is not due " + "to corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + break; + case kTypeBeginUnprepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); + handler->MarkBeginPrepare(true /* unprepared */); + empty_batch = false; + if (handler->WriteAfterCommit()) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_after_commit_ is enabled (in " + "default WriteCommitted mode). If it is not due to corruption, " + "the WAL must be emptied before changing the WritePolicy."); + } + if (!handler->WriteBeforePrepare()) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_before_prepare_ is disabled " + "(in WriteCommitted/WritePrepared mode). If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + break; + case kTypeEndPrepareXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); + handler->MarkEndPrepare(xid); + empty_batch = true; + break; + case kTypeCommitXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + handler->MarkCommit(xid); + empty_batch = true; + break; + case kTypeRollbackXID: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); + handler->MarkRollback(xid); + empty_batch = true; + break; + case kTypeNoop: + handler->MarkNoop(empty_batch); + empty_batch = true; + break; + default: + return Status::Corruption("unknown WriteBatch tag"); + } + } + if (!s.ok()) { + return s; + } + if (handler_continue && whole_batch && + found != WriteBatchInternal::Count(wb)) { + return Status::Corruption("WriteBatch has wrong count"); + } else { + return Status::OK(); + } +} + +bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) { + return b->is_latest_persistent_state_; +} + +void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) { + b->is_latest_persistent_state_ = true; +} + +uint32_t WriteBatchInternal::Count(const WriteBatch* b) { + return DecodeFixed32(b->rep_.data() + 8); +} + +void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) { + EncodeFixed32(&b->rep_[8], n); +} + +SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { + return SequenceNumber(DecodeFixed64(b->rep_.data())); +} + +void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { + EncodeFixed64(&b->rep_[0], seq); +} + +size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) { + return WriteBatchInternal::kHeader; +} + +Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + if (key.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("key is too large"); + } + if (value.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("value is too large"); + } + + 
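+  // For a default-column-family Put, the bytes appended below follow the
+  // rep_ grammar at the top of this file:
+  //   kTypeValue | varint32(key_len) | key | varint32(value_len) | value
+  // (kTypeColumnFamilyValue plus a varint32 column family id is used instead
+  // when column_family_id != 0, and timestamp_size_ zero bytes are reserved
+  // after the key when timestamps are enabled).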
LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeValue)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + if (0 == b->timestamp_size_) { + PutLengthPrefixedSlice(&b->rep_, key); + } else { + PutVarint32(&b->rep_, + static_cast(key.size() + b->timestamp_size_)); + b->rep_.append(key.data(), key.size()); + b->rep_.append(b->timestamp_size_, '\0'); + } + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + return WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, + value); +} + +Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key, + const SliceParts& value) { + size_t total_key_bytes = 0; + for (int i = 0; i < key.num_parts; ++i) { + total_key_bytes += key.parts[i].size(); + } + if (total_key_bytes >= size_t{port::kMaxUint32}) { + return Status::InvalidArgument("key is too large"); + } + + size_t total_value_bytes = 0; + for (int i = 0; i < value.num_parts; ++i) { + total_value_bytes += value.parts[i].size(); + } + if (total_value_bytes >= size_t{port::kMaxUint32}) { + return Status::InvalidArgument("value is too large"); + } + return Status::OK(); +} + +Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key, const SliceParts& value) { + Status s = CheckSlicePartsLength(key, value); + if (!s.ok()) { + return s; + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeValue)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); + PutVarint32(&b->rep_, column_family_id); + } + if (0 == b->timestamp_size_) { + PutLengthPrefixedSliceParts(&b->rep_, key); + } else { + PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_); + } + PutLengthPrefixedSliceParts(&b->rep_, value); + b->content_flags_.store( + b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key, + const SliceParts& value) { + return WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, + value); +} + +Status WriteBatchInternal::InsertNoop(WriteBatch* b) { + b->rep_.push_back(static_cast(kTypeNoop)); + return Status::OK(); +} + +Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid, + bool write_after_commit, + bool unprepared_batch) { + // a manually constructed batch can only contain one prepare section + assert(b->rep_[12] == static_cast(kTypeNoop)); + + // all savepoints up to this point are cleared + if (b->save_points_ != nullptr) { + while (!b->save_points_->stack.empty()) { + b->save_points_->stack.pop(); + } + } + + // rewrite noop as begin marker + b->rep_[12] = static_cast( + write_after_commit ? kTypeBeginPrepareXID + : (unprepared_batch ? 
kTypeBeginUnprepareXID + : kTypeBeginPersistedPrepareXID)); + b->rep_.push_back(static_cast(kTypeEndPrepareXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_END_PREPARE | + ContentFlags::HAS_BEGIN_PREPARE, + std::memory_order_relaxed); + if (unprepared_batch) { + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BEGIN_UNPREPARE, + std::memory_order_relaxed); + } + return Status::OK(); +} + +Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast(kTypeCommitXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) { + b->rep_.push_back(static_cast(kTypeRollbackXID)); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_ROLLBACK, + std::memory_order_relaxed); + return Status::OK(); +} + +Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const Slice& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) { + return WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), + key); +} + +Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + return WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), + key); +} + +Status WriteBatchInternal::SingleDelete(WriteBatch* b, + uint32_t column_family_id, + const Slice& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeSingleDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilySingleDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_SINGLE_DELETE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, + const Slice& key) { + return WriteBatchInternal::SingleDelete( + this, 
GetColumnFamilyID(column_family), key); +} + +Status WriteBatchInternal::SingleDelete(WriteBatch* b, + uint32_t column_family_id, + const SliceParts& key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeSingleDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilySingleDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_SINGLE_DELETE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + return WriteBatchInternal::SingleDelete( + this, GetColumnFamilyID(column_family), key); +} + +Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, + const Slice& begin_key, + const Slice& end_key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeRangeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyRangeDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, begin_key); + PutLengthPrefixedSlice(&b->rep_, end_key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE_RANGE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + return WriteBatchInternal::DeleteRange(this, GetColumnFamilyID(column_family), + begin_key, end_key); +} + +Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, + const SliceParts& begin_key, + const SliceParts& end_key) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeRangeDeletion)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyRangeDeletion)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, begin_key); + PutLengthPrefixedSliceParts(&b->rep_, end_key); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_DELETE_RANGE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) { + return WriteBatchInternal::DeleteRange(this, GetColumnFamilyID(column_family), + begin_key, end_key); +} + +Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, + const Slice& key, const Slice& value) { + if (key.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("key is too large"); + } + if (value.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("value is too large"); + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeMerge)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyMerge)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + 
ContentFlags::HAS_MERGE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + return WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key, + value); +} + +Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, + const SliceParts& key, + const SliceParts& value) { + Status s = CheckSlicePartsLength(key, value); + if (!s.ok()) { + return s; + } + + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeMerge)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyMerge)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSliceParts(&b->rep_, key); + PutLengthPrefixedSliceParts(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_MERGE, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::Merge(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value) { + return WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key, + value); +} + +Status WriteBatchInternal::PutBlobIndex(WriteBatch* b, + uint32_t column_family_id, + const Slice& key, const Slice& value) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeBlobIndex)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyBlobIndex)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BLOB_INDEX, + std::memory_order_relaxed); + return save.commit(); +} + +Status WriteBatch::PutLogData(const Slice& blob) { + LocalSavePoint save(this); + rep_.push_back(static_cast(kTypeLogData)); + PutLengthPrefixedSlice(&rep_, blob); + return save.commit(); +} + +void WriteBatch::SetSavePoint() { + if (save_points_ == nullptr) { + save_points_.reset(new SavePoints()); + } + // Record length and count of current batch of writes. 
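+  // Typical usage (sketch): SetSavePoint(); ...more writes...; then either
+  // RollbackToSavePoint(), which truncates rep_ back to the size recorded
+  // here and restores the saved count and content flags, or PopSavePoint(),
+  // which simply discards the record.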
+ save_points_->stack.push(SavePoint( + GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed))); +} + +Status WriteBatch::RollbackToSavePoint() { + if (save_points_ == nullptr || save_points_->stack.size() == 0) { + return Status::NotFound(); + } + + // Pop the most recent savepoint off the stack + SavePoint savepoint = save_points_->stack.top(); + save_points_->stack.pop(); + + assert(savepoint.size <= rep_.size()); + assert(static_cast(savepoint.count) <= Count()); + + if (savepoint.size == rep_.size()) { + // No changes to rollback + } else if (savepoint.size == 0) { + // Rollback everything + Clear(); + } else { + rep_.resize(savepoint.size); + WriteBatchInternal::SetCount(this, savepoint.count); + content_flags_.store(savepoint.content_flags, std::memory_order_relaxed); + } + + return Status::OK(); +} + +Status WriteBatch::PopSavePoint() { + if (save_points_ == nullptr || save_points_->stack.size() == 0) { + return Status::NotFound(); + } + + // Pop the most recent savepoint off the stack + save_points_->stack.pop(); + + return Status::OK(); +} + +Status WriteBatch::AssignTimestamp(const Slice& ts) { + TimestampAssigner ts_assigner(ts); + return Iterate(&ts_assigner); +} + +Status WriteBatch::AssignTimestamps(const std::vector& ts_list) { + TimestampAssigner ts_assigner(ts_list); + return Iterate(&ts_assigner); +} + +class MemTableInserter : public WriteBatch::Handler { + + SequenceNumber sequence_; + ColumnFamilyMemTables* const cf_mems_; + FlushScheduler* const flush_scheduler_; + TrimHistoryScheduler* const trim_history_scheduler_; + const bool ignore_missing_column_families_; + const uint64_t recovering_log_number_; + // log number that all Memtables inserted into should reference + uint64_t log_number_ref_; + DBImpl* db_; + const bool concurrent_memtable_writes_; + bool post_info_created_; + + bool* has_valid_writes_; + // On some (!) platforms just default creating + // a map is too expensive in the Write() path as they + // cause memory allocations though unused. + // Make creation optional but do not incur + // std::unique_ptr additional allocation + using MemPostInfoMap = std::map; + using PostMapType = std::aligned_storage::type; + PostMapType mem_post_info_map_; + // current recovered transaction we are rebuilding (recovery) + WriteBatch* rebuilding_trx_; + SequenceNumber rebuilding_trx_seq_; + // Increase seq number once per each write batch. Otherwise increase it once + // per key. 
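+  // (Example: a batch with three Puts normally consumes three sequence
+  // numbers; with seq_per_batch_ set it consumes one, and MaybeAdvanceSeq()
+  // only advances the sequence at sub-batch boundaries.)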
+ bool seq_per_batch_; + // Whether the memtable write will be done only after the commit + bool write_after_commit_; + // Whether memtable write can be done before prepare + bool write_before_prepare_; + // Whether this batch was unprepared or not + bool unprepared_batch_; + using DupDetector = std::aligned_storage::type; + DupDetector duplicate_detector_; + bool dup_dectector_on_; + + bool hint_per_batch_; + bool hint_created_; + // Hints for this batch + using HintMap = std::unordered_map; + using HintMapType = std::aligned_storage::type; + HintMapType hint_; + + HintMap& GetHintMap() { + assert(hint_per_batch_); + if (!hint_created_) { + new (&hint_) HintMap(); + hint_created_ = true; + } + return *reinterpret_cast(&hint_); + } + + MemPostInfoMap& GetPostMap() { + assert(concurrent_memtable_writes_); + if(!post_info_created_) { + new (&mem_post_info_map_) MemPostInfoMap(); + post_info_created_ = true; + } + return *reinterpret_cast(&mem_post_info_map_); + } + + bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) { + assert(!write_after_commit_); + assert(rebuilding_trx_ != nullptr); + if (!dup_dectector_on_) { + new (&duplicate_detector_) DuplicateDetector(db_); + dup_dectector_on_ = true; + } + return reinterpret_cast + (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_); + } + + protected: + bool WriteBeforePrepare() const override { return write_before_prepare_; } + bool WriteAfterCommit() const override { return write_after_commit_; } + + public: + // cf_mems should not be shared with concurrent inserters + MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems, + FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, + uint64_t recovering_log_number, DB* db, + bool concurrent_memtable_writes, + bool* has_valid_writes = nullptr, bool seq_per_batch = false, + bool batch_per_txn = true, bool hint_per_batch = false) + : sequence_(_sequence), + cf_mems_(cf_mems), + flush_scheduler_(flush_scheduler), + trim_history_scheduler_(trim_history_scheduler), + ignore_missing_column_families_(ignore_missing_column_families), + recovering_log_number_(recovering_log_number), + log_number_ref_(0), + db_(static_cast_with_check(db)), + concurrent_memtable_writes_(concurrent_memtable_writes), + post_info_created_(false), + has_valid_writes_(has_valid_writes), + rebuilding_trx_(nullptr), + rebuilding_trx_seq_(0), + seq_per_batch_(seq_per_batch), + // Write after commit currently uses one seq per key (instead of per + // batch). So seq_per_batch being false indicates write_after_commit + // approach. + write_after_commit_(!seq_per_batch), + // WriteUnprepared can write WriteBatches per transaction, so + // batch_per_txn being false indicates write_before_prepare. 
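+        // Rough policy-to-flag mapping, as implied by the checks in
+        // WriteBatchInternal::Iterate(): WriteCommitted sets
+        // write_after_commit_ only; WritePrepared sets neither flag;
+        // WriteUnprepared sets write_before_prepare_ only.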
+ write_before_prepare_(!batch_per_txn), + unprepared_batch_(false), + duplicate_detector_(), + dup_dectector_on_(false), + hint_per_batch_(hint_per_batch), + hint_created_(false) { + assert(cf_mems_); + } + + ~MemTableInserter() override { + if (dup_dectector_on_) { + reinterpret_cast + (&duplicate_detector_)->~DuplicateDetector(); + } + if (post_info_created_) { + reinterpret_cast + (&mem_post_info_map_)->~MemPostInfoMap(); + } + if (hint_created_) { + for (auto iter : GetHintMap()) { + delete[] reinterpret_cast(iter.second); + } + reinterpret_cast(&hint_)->~HintMap(); + } + delete rebuilding_trx_; + } + + MemTableInserter(const MemTableInserter&) = delete; + MemTableInserter& operator=(const MemTableInserter&) = delete; + + // The batch seq is regularly restarted; In normal mode it is set when + // MemTableInserter is constructed in the write thread and in recovery mode it + // is set when a batch, which is tagged with seq, is read from the WAL. + // Within a sequenced batch, which could be a merge of multiple batches, we + // have two policies to advance the seq: i) seq_per_key (default) and ii) + // seq_per_batch. To implement the latter we need to mark the boundary between + // the individual batches. The approach is this: 1) Use the terminating + // markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID, + // kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a + // natural boundary marker. + void MaybeAdvanceSeq(bool batch_boundry = false) { + if (batch_boundry == seq_per_batch_) { + sequence_++; + } + } + + void set_log_number_ref(uint64_t log) { log_number_ref_ = log; } + + SequenceNumber sequence() const { return sequence_; } + + void PostProcess() { + assert(concurrent_memtable_writes_); + // If post info was not created there is nothing + // to process and no need to create on demand + if(post_info_created_) { + for (auto& pair : GetPostMap()) { + pair.first->BatchPostProcess(pair.second); + } + } + } + + bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { + // If we are in a concurrent mode, it is the caller's responsibility + // to clone the original ColumnFamilyMemTables so that each thread + // has its own instance. Otherwise, it must be guaranteed that there + // is no concurrent access + bool found = cf_mems_->Seek(column_family_id); + if (!found) { + if (ignore_missing_column_families_) { + *s = Status::OK(); + } else { + *s = Status::InvalidArgument( + "Invalid column family specified in write batch"); + } + return false; + } + if (recovering_log_number_ != 0 && + recovering_log_number_ < cf_mems_->GetLogNumber()) { + // This is true only in recovery environment (recovering_log_number_ is + // always 0 in + // non-recovery, regular write code-path) + // * If recovering_log_number_ < cf_mems_->GetLogNumber(), this means that + // column + // family already contains updates from this log. 
We can't apply updates + // twice because of update-in-place or merge workloads -- ignore the + // update + *s = Status::OK(); + return false; + } + + if (has_valid_writes_ != nullptr) { + *has_valid_writes_ = true; + } + + if (log_number_ref_ > 0) { + cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_); + } + + return true; + } + + Status PutCFImpl(uint32_t column_family_id, const Slice& key, + const Slice& value, ValueType value_type) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + return Status::OK(); + // else insert the values to the memtable right away + } + + Status seek_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); + return seek_status; + } + Status ret_status; + + MemTable* mem = cf_mems_->GetMemTable(); + auto* moptions = mem->GetImmutableMemTableOptions(); + // inplace_update_support is inconsistent with snapshots, and therefore with + // any kind of transactions including the ones that use seq_per_batch + assert(!seq_per_batch_ || !moptions->inplace_update_support); + if (!moptions->inplace_update_support) { + bool mem_res = + mem->Add(sequence_, value_type, key, value, + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? &GetHintMap()[mem] : nullptr); + if (UNLIKELY(!mem_res)) { + assert(seq_per_batch_); + ret_status = Status::TryAgain("key+seq exists"); + const bool BATCH_BOUNDRY = true; + MaybeAdvanceSeq(BATCH_BOUNDRY); + } + } else if (moptions->inplace_callback == nullptr) { + assert(!concurrent_memtable_writes_); + mem->Update(sequence_, key, value); + } else { + assert(!concurrent_memtable_writes_); + if (mem->UpdateCallback(sequence_, key, value)) { + } else { + // key not found in memtable. Do sst get, update, add + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = sequence_; + ReadOptions ropts; + // it's going to be overwritten for sure, so no point caching data block + // containing the old version + ropts.fill_cache = false; + ropts.snapshot = &read_from_snapshot; + + std::string prev_value; + std::string merged_value; + + auto cf_handle = cf_mems_->GetColumnFamilyHandle(); + Status s = Status::NotSupported(); + if (db_ != nullptr && recovering_log_number_ == 0) { + if (cf_handle == nullptr) { + cf_handle = db_->DefaultColumnFamily(); + } + s = db_->Get(ropts, cf_handle, key, &prev_value); + } + + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = static_cast(prev_value.size()); + auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); + if (status == UpdateStatus::UPDATED_INPLACE) { + // prev_value is updated in-place with final value. + bool mem_res __attribute__((__unused__)); + mem_res = mem->Add( + sequence_, value_type, key, Slice(prev_buffer, prev_size)); + assert(mem_res); + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } else if (status == UpdateStatus::UPDATED) { + // merged_value contains the final value. 
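+          // (Note: the remaining UpdateStatus outcome, UPDATE_FAILED,
+          // matches neither branch here, so nothing is added to the
+          // memtable in that case.)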
+ bool mem_res __attribute__((__unused__)); + mem_res = + mem->Add(sequence_, value_type, key, Slice(merged_value)); + assert(mem_res); + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } + } + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try to add the ky to + // the rebuilding transaction object. + WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + } + // Since all Puts are logged in transaction logs (if enabled), always bump + // sequence number. Even if the update eventually fails and does not result + // in memtable add/update. + MaybeAdvanceSeq(); + CheckMemtableFull(); + return ret_status; + } + + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + return PutCFImpl(column_family_id, key, value, kTypeValue); + } + + Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, + const Slice& value, ValueType delete_type) { + Status ret_status; + MemTable* mem = cf_mems_->GetMemTable(); + bool mem_res = + mem->Add(sequence_, delete_type, key, value, + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? &GetHintMap()[mem] : nullptr); + if (UNLIKELY(!mem_res)) { + assert(seq_per_batch_); + ret_status = Status::TryAgain("key+seq exists"); + const bool BATCH_BOUNDRY = true; + MaybeAdvanceSeq(BATCH_BOUNDRY); + } + MaybeAdvanceSeq(); + CheckMemtableFull(); + return ret_status; + } + + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + return Status::OK(); + // else insert the values to the memtable right away + } + + Status seek_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); + return seek_status; + } + + auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion); + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try to add the ky to + // the rebuilding transaction object. + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + } + return ret_status; + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { + WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + return Status::OK(); + // else insert the values to the memtable right away + } + + Status seek_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. 
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
+ key);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ auto ret_status =
+ DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If the ret_status is TryAgain then let the next try add the key to
+ // the rebuilding transaction object.
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key);
+ }
+ return ret_status;
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ return Status::OK();
+ // else insert the values to the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF is probably flushed and hence no need for insert but we still
+ // need to keep track of the keys for upcoming rollback/commit.
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ // TODO(myabandeh): when transactional DeleteRange support is added,
+ // check if end_key must also be added.
+ batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+ if (db_ != nullptr) {
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cf_handle)->cfd();
+ if (!cfd->is_delete_range_supported()) {
+ return Status::NotSupported(
+ std::string("DeleteRange not supported for table type ") +
+ cfd->ioptions()->table_factory->Name() + " in CF " +
+ cfd->GetName());
+ }
+ }
+
+ auto ret_status =
+ DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion);
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If the ret_status is TryAgain then let the next try add the key to
+ // the rebuilding transaction object.
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+ }
+ return ret_status;
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
+ return Status::OK();
+ // else insert the values to the memtable right away
+ }
+
+ Status seek_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
+ bool batch_boundry = false;
+ if (rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF is probably flushed and hence no need for insert but we still
+ // need to keep track of the keys for upcoming rollback/commit.
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
+ value);
+ batch_boundry = IsDuplicateKeySeq(column_family_id, key);
+ }
+ MaybeAdvanceSeq(batch_boundry);
+ return seek_status;
+ }
+
+ Status ret_status;
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ bool perform_merge = false;
+ assert(!concurrent_memtable_writes_ ||
+ moptions->max_successive_merges == 0);
+
+ // If we pass DB through and options.max_successive_merges is hit
+ // during recovery, Get() will be issued which will try to acquire
+ // DB mutex and cause deadlock, as DB mutex is already held.
+ // So we disable merge in recovery
+ if (moptions->max_successive_merges > 0 && db_ != nullptr &&
+ recovering_log_number_ == 0) {
+ assert(!concurrent_memtable_writes_);
+ LookupKey lkey(key, sequence_);
+
+ // Count the number of successive merges at the head
+ // of the key in the memtable
+ size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
+
+ if (num_merges >= moptions->max_successive_merges) {
+ perform_merge = true;
+ }
+ }
+
+ if (perform_merge) {
+ // 1) Get the existing value
+ std::string get_value;
+
+ // Pass in the sequence number so that we also include previous merge
+ // operations in the same batch.
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions read_options;
+ read_options.snapshot = &read_from_snapshot;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ db_->Get(read_options, cf_handle, key, &get_value);
+ Slice get_value_slice = Slice(get_value);
+
+ // 2) Apply this merge
+ auto merge_operator = moptions->merge_operator;
+ assert(merge_operator);
+
+ std::string new_value;
+
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator, key, &get_value_slice, {value}, &new_value,
+ moptions->info_log, moptions->statistics, Env::Default());
+
+ if (!merge_status.ok()) {
+ // Failed to merge!
+ // Store the delta in memtable
+ perform_merge = false;
+ } else {
+ // 3) Add value to memtable
+ assert(!concurrent_memtable_writes_);
+ bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value);
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ }
+ }
+
+ if (!perform_merge) {
+ // Add merge operator to memtable
+ bool mem_res =
+ mem->Add(sequence_, kTypeMerge, key, value,
+ concurrent_memtable_writes_, get_post_process_info(mem));
+ if (UNLIKELY(!mem_res)) {
+ assert(seq_per_batch_);
+ ret_status = Status::TryAgain("key+seq exists");
+ const bool BATCH_BOUNDRY = true;
+ MaybeAdvanceSeq(BATCH_BOUNDRY);
+ }
+ }
+
+ // optimize for non-recovery mode
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // If the ret_status is TryAgain then let the next try add the key to
+ // the rebuilding transaction object.
+ WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
+ }
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ return ret_status;
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ // Same as PutCF except for value type.
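+ // NOTE: worked example of the max_successive_merges fold in MergeCF above
+ // (hypothetical values; assumes a string-append merge operator):
+ //   max_successive_merges = 2; memtable holds Merge("a"), Merge("b") for "k"
+ //   MergeCF("k", "c"): num_merges (2) >= limit (2) -> perform_merge = true
+ //     Get("k") under a snapshot at sequence_  -> "ab"
+ //     TimedFullMerge("ab", {"c"})             -> "abc"
+ //     mem->Add(sequence_, kTypeValue, "k", "abc")  // folded into one Put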
+ return PutCFImpl(column_family_id, key, value, kTypeBlobIndex);
+ }
+
+ void CheckMemtableFull() {
+ if (flush_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+ assert(cfd != nullptr);
+ if (cfd->mem()->ShouldScheduleFlush() &&
+ cfd->mem()->MarkFlushScheduled()) {
+ // MarkFlushScheduled only returns true if we are the one that
+ // should take action, so no need to dedup further
+ flush_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ // check if memtable_list size exceeds max_write_buffer_size_to_maintain
+ if (trim_history_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+
+ assert(cfd);
+ assert(cfd->ioptions());
+
+ const size_t size_to_maintain = static_cast<size_t>(
+ cfd->ioptions()->max_write_buffer_size_to_maintain);
+
+ if (size_to_maintain > 0) {
+ MemTableList* const imm = cfd->imm();
+ assert(imm);
+
+ if (imm->HasHistory()) {
+ const MemTable* const mem = cfd->mem();
+ assert(mem);
+
+ if (mem->ApproximateMemoryUsageFast() +
+ imm->ApproximateMemoryUsageExcludingLast() >=
+ size_to_maintain &&
+ imm->MarkTrimHistoryNeeded()) {
+ trim_history_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ }
+ }
+ }
+
+ // The write batch handler calls MarkBeginPrepare with unprepare set to true
+ // if it encounters the kTypeBeginUnprepareXID marker.
+ Status MarkBeginPrepare(bool unprepare) override {
+ assert(rebuilding_trx_ == nullptr);
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ // during recovery we rebuild a hollow transaction
+ // from all encountered prepare sections of the wal
+ if (db_->allow_2pc() == false) {
+ return Status::NotSupported(
+ "WAL contains prepared transactions. Open with "
+ "TransactionDB::Open().");
+ }
+
+ // we are now iterating through a prepared section
+ rebuilding_trx_ = new WriteBatch();
+ rebuilding_trx_seq_ = sequence_;
+ // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
+ // unprepared_batch_ should be false because it is false by default, and
+ // gets reset to false in MarkEndPrepare.
+ assert(!unprepared_batch_);
+ unprepared_batch_ = unprepare;
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice& name) override {
+ assert(db_);
+ assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
+
+ if (recovering_log_number_ != 0) {
+ assert(db_->allow_2pc());
+ size_t batch_cnt =
+ write_after_commit_
+ ? 0 // 0 will disable further checks
+ : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
+ db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
+ rebuilding_trx_, rebuilding_trx_seq_,
+ batch_cnt, unprepared_batch_);
+ unprepared_batch_ = false;
+ rebuilding_trx_ = nullptr;
+ } else {
+ assert(rebuilding_trx_ == nullptr);
+ }
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return Status::OK();
+ }
+
+ Status MarkNoop(bool empty_batch) override {
+ // A hack in pessimistic transactions could result in a noop at the start
+ // of the write batch, which should be ignored.
+ if (!empty_batch) {
+ // In the absence of Prepare markers, a kTypeNoop tag indicates the end of
+ // a batch. This happens when a write batch commits while skipping the
+ // prepare phase.
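+ // NOTE: sketch of the sequence bookkeeping used throughout this handler
+ // (inferred from the calls above; illustrative):
+ //   MaybeAdvanceSeq(false)  // per-key mode: advances once per update
+ //   MaybeAdvanceSeq(true)   // seq_per_batch mode: advances at boundaries
+ // i.e. the counter moves when the boundary flag matches the seq_per_batch_
+ // setting.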
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+ }
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice& name) override {
+ assert(db_);
+
+ Status s;
+
+ if (recovering_log_number_ != 0) {
+ // in recovery when we encounter a commit marker
+ // we look up this transaction in our set of rebuilt transactions
+ // and commit.
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the prepared section may have
+ // been released in the last incarnation because the
+ // data was flushed to L0
+ if (trx != nullptr) {
+ // at this point individual CF lognumbers will prevent
+ // duplicate re-insertion of values.
+ assert(log_number_ref_ == 0);
+ if (write_after_commit_) {
+ // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1);
+ const auto& batch_info = trx->batches_.begin()->second;
+ // all inserts must reference this trx log number
+ log_number_ref_ = batch_info.log_number_;
+ s = batch_info.batch_->Iterate(this);
+ log_number_ref_ = 0;
+ }
+ // else the values are already inserted before the commit
+
+ if (s.ok()) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+ } else {
+ // When writes are not delayed until commit, there is no disconnect
+ // between a memtable write and the WAL that supports it. So the commit
+ // need not reference the log on which it depends.
+ assert(!write_after_commit_ || log_number_ref_ > 0);
+ }
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return s;
+ }
+
+ Status MarkRollback(const Slice& name) override {
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the transaction's prep section
+ // may have been released in the previous incarnation
+ // because we knew it had been rolled back
+ if (trx != nullptr) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ } else {
+ // in non-recovery mode we simply ignore this tag
+ }
+
+ const bool batch_boundry = true;
+ MaybeAdvanceSeq(batch_boundry);
+
+ return Status::OK();
+ }
+
+ private:
+ MemTablePostProcessInfo* get_post_process_info(MemTable* mem) {
+ if (!concurrent_memtable_writes_) {
+ // No need to batch counters locally if we don't use concurrent mode.
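+ // NOTE: sketch of the concurrent-write pattern this helper supports
+ // (illustrative; PostProcess() performs the final fold):
+ //   MemTablePostProcessInfo info;                    // thread-local
+ //   mem->Add(seq, kTypeValue, key, value,
+ //            true /*concurrent*/, &info);            // counts locally
+ //   // after the batch: fold info back into the memtable's shared stats
+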
return nullptr;
+ }
+ return &GetPostMap()[mem];
+ }
+};
+
+// This function can only be called in these conditions:
+// 1) During Recovery()
+// 2) During Write(), in a single-threaded write thread
+// 3) During Write(), in a concurrent context where memtables have been cloned
+// The reason is that it calls memtables->Seek(), which has a stateful cache
+Status WriteBatchInternal::InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, recovery_log_number, db,
+ concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn);
+ for (auto w : write_group) {
+ if (w->CallbackFailed()) {
+ continue;
+ }
+ w->sequence = inserter.sequence();
+ if (!w->ShouldWriteToMemtable()) {
+ // In seq_per_batch_ mode this advances the seq by one.
+ inserter.MaybeAdvanceSeq(true);
+ continue;
+ }
+ SetSequence(w->batch, inserter.sequence());
+ inserter.set_log_number_ref(w->log_ref);
+ w->status = w->batch->Iterate(&inserter);
+ if (!w->status.ok()) {
+ return w->status;
+ }
+ assert(!seq_per_batch || w->batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::InsertInto(
+ WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt,
+ bool batch_per_txn, bool hint_per_batch) {
+#ifdef NDEBUG
+ (void)batch_cnt;
+#endif
+ assert(writer->ShouldWriteToMemtable());
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn, hint_per_batch);
+ SetSequence(writer->batch, sequence);
+ inserter.set_log_number_ref(writer->log_ref);
+ Status s = writer->batch->Iterate(&inserter);
+ assert(!seq_per_batch || batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, SequenceNumber* next_seq,
+ bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler,
+ trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, has_valid_writes,
+ seq_per_batch, batch_per_txn);
+ Status s = batch->Iterate(&inserter);
+ if (next_seq != nullptr) {
+ *next_seq = inserter.sequence();
+ }
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+
assert(contents.size() >= WriteBatchInternal::kHeader);
+ b->rep_.assign(contents.data(), contents.size());
+ b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
+ const bool wal_only) {
+ size_t src_len;
+ int src_count;
+ uint32_t src_flags;
+
+ const SavePoint& batch_end = src->GetWalTerminationPoint();
+
+ if (wal_only && !batch_end.is_cleared()) {
+ src_len = batch_end.size - WriteBatchInternal::kHeader;
+ src_count = batch_end.count;
+ src_flags = batch_end.content_flags;
+ } else {
+ src_len = src->rep_.size() - WriteBatchInternal::kHeader;
+ src_count = Count(src);
+ src_flags = src->content_flags_.load(std::memory_order_relaxed);
+ }
+
+ SetCount(dst, Count(dst) + src_count);
+ assert(src->rep_.size() >= WriteBatchInternal::kHeader);
+ dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
+ dst->content_flags_.store(
+ dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
+ size_t rightByteSize) {
+ if (leftByteSize == 0 || rightByteSize == 0) {
+ return leftByteSize + rightByteSize;
+ } else {
+ return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 000000000..e4c0e74bd
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple implementation of SlicePart variants of Put(). Child classes
+// can override these methods with more performant solutions if they choose.
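+// NOTE: a SliceParts is an array of fragments treated as one logical key or
+// value; the helpers below simply flatten it into a scratch string. Minimal
+// usage sketch (illustrative values):
+//
+//   Slice key_parts[2] = {Slice("user:"), Slice("42")};
+//   Slice val("profile");
+//   WriteBatch batch;
+//   batch.Put(SliceParts(key_parts, 2), SliceParts(&val, 1));
+//   // equivalent to batch.Put("user:42", "profile")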
+Status WriteBatchBase::Put(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Put(column_family, key_slice, value_slice); +} + +Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Put(key_slice, value_slice); +} + +Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return Delete(column_family, key_slice); +} + +Status WriteBatchBase::Delete(const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return Delete(key_slice); +} + +Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family, + const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return SingleDelete(column_family, key_slice); +} + +Status WriteBatchBase::SingleDelete(const SliceParts& key) { + std::string key_buf; + Slice key_slice(key, &key_buf); + return SingleDelete(key_slice); +} + +Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family, + const SliceParts& begin_key, + const SliceParts& end_key) { + std::string begin_key_buf, end_key_buf; + Slice begin_key_slice(begin_key, &begin_key_buf); + Slice end_key_slice(end_key, &end_key_buf); + return DeleteRange(column_family, begin_key_slice, end_key_slice); +} + +Status WriteBatchBase::DeleteRange(const SliceParts& begin_key, + const SliceParts& end_key) { + std::string begin_key_buf, end_key_buf; + Slice begin_key_slice(begin_key, &begin_key_buf); + Slice end_key_slice(end_key, &end_key_buf); + return DeleteRange(begin_key_slice, end_key_slice); +} + +Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family, + const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Merge(column_family, key_slice, value_slice); +} + +Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) { + std::string key_buf, value_buf; + Slice key_slice(key, &key_buf); + Slice value_slice(value, &value_buf); + + return Merge(key_slice, value_slice); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h new file mode 100644 index 000000000..30c489965 --- /dev/null +++ b/src/rocksdb/db/write_batch_internal.h @@ -0,0 +1,250 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
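+
+// NOTE: record layout manipulated by the helpers in this header
+// (reconstructed from kHeader and the accessors below; see write_batch.cc
+// for the authoritative format comment):
+//
+//   WriteBatch::rep_ :=
+//     fixed64 sequence   // bytes 0..7
+//     fixed32 count      // bytes 8..11  -> kHeader == 12
+//     record[count]      // tagged records: Put, Delete, Merge, markers, ...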
+
+#pragma once
+#include <vector>
+#include "db/flush_scheduler.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_thread.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+class FlushScheduler;
+class ColumnFamilyData;
+
+class ColumnFamilyMemTables {
+ public:
+ virtual ~ColumnFamilyMemTables() {}
+ virtual bool Seek(uint32_t column_family_id) = 0;
+ // returns true if the update to memtable should be ignored
+ // (useful when recovering from log whose updates have already
+ // been processed)
+ virtual uint64_t GetLogNumber() const = 0;
+ virtual MemTable* GetMemTable() const = 0;
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+ virtual ColumnFamilyData* current() { return nullptr; }
+};
+
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+ : ok_(false), mem_(mem) {}
+
+ bool Seek(uint32_t column_family_id) override {
+ ok_ = (column_family_id == 0);
+ return ok_;
+ }
+
+ uint64_t GetLogNumber() const override { return 0; }
+
+ MemTable* GetMemTable() const override {
+ assert(ok_);
+ return mem_;
+ }
+
+ ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+ bool ok_;
+ MemTable* mem_;
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+
+ // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+ static const size_t kHeader = 12;
+
+ // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key, const Slice& end_key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
+ const bool write_after_commit = true,
+ const bool unprepared_batch = false);
+
+ static Status MarkRollback(WriteBatch* batch, const Slice& xid);
+
+ static Status MarkCommit(WriteBatch* batch, const Slice& xid);
+
+ static Status InsertNoop(WriteBatch* batch);
+
+ // Return the number of entries in the batch.
+ static uint32_t Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, uint32_t n); + + // Return the sequence number for the start of this batch. + static SequenceNumber Sequence(const WriteBatch* batch); + + // Store the specified number as the sequence number for the start of + // this batch. + static void SetSequence(WriteBatch* batch, SequenceNumber seq); + + // Returns the offset of the first entry in the batch. + // This offset is only valid if the batch is not empty. + static size_t GetFirstOffset(WriteBatch* batch); + + static Slice Contents(const WriteBatch* batch) { + return Slice(batch->rep_); + } + + static size_t ByteSize(const WriteBatch* batch) { + return batch->rep_.size(); + } + + static Status SetContents(WriteBatch* batch, const Slice& contents); + + static Status CheckSlicePartsLength(const SliceParts& key, + const SliceParts& value); + + // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive. + // + // If ignore_missing_column_families == true. WriteBatch + // referencing non-existing column family will be ignored. + // If ignore_missing_column_families == false, processing of the + // batches will be stopped if a reference is found to a non-existing + // column family and InvalidArgument() will be returned. The writes + // in batches may be only partially applied at that point. + // + // If log_number is non-zero, the memtable will be updated only if + // memtables->GetLogNumber() >= log_number. + // + // If flush_scheduler is non-null, it will be invoked if the memtable + // should be flushed. + // + // Under concurrent use, the caller is responsible for making sure that + // the memtables object itself is thread-local. + static Status InsertInto( + WriteThread::WriteGroup& write_group, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families = false, uint64_t log_number = 0, + DB* db = nullptr, bool concurrent_memtable_writes = false, + bool seq_per_batch = false, bool batch_per_txn = true); + + // Convenience form of InsertInto when you have only one batch + // next_seq returns the seq after last sequence number used in MemTable insert + static Status InsertInto( + const WriteBatch* batch, ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families = false, uint64_t log_number = 0, + DB* db = nullptr, bool concurrent_memtable_writes = false, + SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr, + bool seq_per_batch = false, bool batch_per_txn = true); + + static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families = false, + uint64_t log_number = 0, DB* db = nullptr, + bool concurrent_memtable_writes = false, + bool seq_per_batch = false, size_t batch_cnt = 0, + bool batch_per_txn = true, + bool hint_per_batch = false); + + static Status Append(WriteBatch* dst, const WriteBatch* src, + const bool WAL_only = false); + + // Returns the byte size of appending a WriteBatch with ByteSize + // leftByteSize and a WriteBatch with ByteSize rightByteSize + static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); + + // Iterate over [begin, end) range of a write batch + static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler, + size_t begin, 
size_t end);
+
+ // This write batch includes the latest state that should be persisted. Such
+ // state is meant to be used only during recovery.
+ static void SetAsLastestPersistentState(WriteBatch* b);
+ static bool IsLatestPersistentState(const WriteBatch* b);
+};
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+ explicit LocalSavePoint(WriteBatch* batch)
+ : batch_(batch),
+ savepoint_(batch->GetDataSize(), batch->Count(),
+ batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+ ,
+ committed_(false)
+#endif
+ {
+ }
+
+#ifndef NDEBUG
+ ~LocalSavePoint() { assert(committed_); }
+#endif
+ Status commit() {
+#ifndef NDEBUG
+ committed_ = true;
+#endif
+ if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+ batch_->rep_.resize(savepoint_.size);
+ WriteBatchInternal::SetCount(batch_, savepoint_.count);
+ batch_->content_flags_.store(savepoint_.content_flags,
+ std::memory_order_relaxed);
+ return Status::MemoryLimit();
+ }
+ return Status::OK();
+ }
+
+ private:
+ WriteBatch* batch_;
+ SavePoint savepoint_;
+#ifndef NDEBUG
+ bool committed_;
+#endif
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
new file mode 100644
index 000000000..84f9a45ec
--- /dev/null
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -0,0 +1,888 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
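+
+// NOTE: LocalSavePoint (write_batch_internal.h, above) is the scope guard
+// WriteBatch mutators use to honor max_bytes_; simplified sketch of the
+// intended call pattern (illustrative, not code from these sources):
+//
+//   Status WriteBatch::Put(const Slice& key, const Slice& value) {
+//     LocalSavePoint save(this);  // snapshot size, count, content flags
+//     // ... append the kTypeValue record to rep_ ...
+//     return save.commit();       // rolls back and returns MemoryLimit()
+//   }                             // if the batch grew past max_bytes_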
+
+#include "rocksdb/db.h"
+
+#include <memory>
+#include "db/column_family.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string PrintContents(WriteBatch* b) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ Options options;
+ options.memtable_factory = factory;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+ std::string state;
+ ColumnFamilyMemTablesDefault cf_mems_default(mem);
+ Status s =
+ WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
+ uint32_t count = 0;
+ int put_count = 0;
+ int delete_count = 0;
+ int single_delete_count = 0;
+ int delete_range_count = 0;
+ int merge_count = 0;
+ for (int i = 0; i < 2; ++i) {
+ Arena arena;
+ ScopedArenaIterator arena_iter_guard;
+ std::unique_ptr<InternalIterator> iter_guard;
+ InternalIterator* iter;
+ if (i == 0) {
+ iter = mem->NewIterator(ReadOptions(), &arena);
+ arena_iter_guard.set(iter);
+ } else {
+ iter = mem->NewRangeTombstoneIterator(ReadOptions(),
+ kMaxSequenceNumber /* read_seq */);
+ iter_guard.reset(iter);
+ }
+ if (iter == nullptr) {
+ continue;
+ }
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey));
+ switch (ikey.type) {
+ case kTypeValue:
+ state.append("Put(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ put_count++;
+ break;
+ case kTypeDeletion:
+ state.append("Delete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ delete_count++;
+ break;
+ case kTypeSingleDeletion:
+ state.append("SingleDelete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ single_delete_count++;
+ break;
+ case kTypeRangeDeletion:
+ state.append("DeleteRange(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ delete_range_count++;
+ break;
+ case kTypeMerge:
+ state.append("Merge(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ merge_count++;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ state.append("@");
+ state.append(NumberToString(ikey.sequence));
+ }
+ }
+ EXPECT_EQ(b->HasPut(), put_count > 0);
+ EXPECT_EQ(b->HasDelete(), delete_count > 0);
+ EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
+ EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
+ EXPECT_EQ(b->HasMerge(), merge_count > 0);
+ if (!s.ok()) {
+ state.append(s.ToString());
+ } else if (count != WriteBatchInternal::Count(b)) {
+ state.append("CountMismatch()");
+ }
+ delete mem->Unref();
+ return state;
+}
+
+class WriteBatchTest : public testing::Test {};
+
+TEST_F(WriteBatchTest, Empty) {
+ WriteBatch batch;
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(0u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Multiple) {
+ WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + batch.DeleteRange(Slice("bar"), Slice("foo")); + batch.Put(Slice("baz"), Slice("boo")); + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); + ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); + ASSERT_EQ( + "Put(baz, boo)@103" + "Delete(box)@101" + "Put(foo, bar)@100" + "DeleteRange(bar, foo)@102", + PrintContents(&batch)); + ASSERT_EQ(4u, batch.Count()); +} + +TEST_F(WriteBatchTest, Corruption) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + batch.Delete(Slice("box")); + WriteBatchInternal::SetSequence(&batch, 200); + Slice contents = WriteBatchInternal::Contents(&batch); + WriteBatchInternal::SetContents(&batch, + Slice(contents.data(),contents.size()-1)); + ASSERT_EQ("Put(foo, bar)@200" + "Corruption: bad WriteBatch Delete", + PrintContents(&batch)); +} + +TEST_F(WriteBatchTest, Append) { + WriteBatch b1, b2; + WriteBatchInternal::SetSequence(&b1, 200); + WriteBatchInternal::SetSequence(&b2, 300); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("", + PrintContents(&b1)); + ASSERT_EQ(0u, b1.Count()); + b2.Put("a", "va"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200", + PrintContents(&b1)); + ASSERT_EQ(1u, b1.Count()); + b2.Clear(); + b2.Put("b", "vb"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@201", + PrintContents(&b1)); + ASSERT_EQ(2u, b1.Count()); + b2.Delete("foo"); + WriteBatchInternal::Append(&b1, &b2); + ASSERT_EQ("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203", + PrintContents(&b1)); + ASSERT_EQ(4u, b1.Count()); + b2.Clear(); + b2.Put("c", "cc"); + b2.Put("d", "dd"); + b2.MarkWalTerminationPoint(); + b2.Put("e", "ee"); + WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true); + ASSERT_EQ( + "Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Put(c, cc)@204" + "Put(d, dd)@205" + "Delete(foo)@203", + PrintContents(&b1)); + ASSERT_EQ(6u, b1.Count()); + ASSERT_EQ( + "Put(c, cc)@0" + "Put(d, dd)@1" + "Put(e, ee)@2", + PrintContents(&b2)); + ASSERT_EQ(3u, b2.Count()); +} + +TEST_F(WriteBatchTest, SingleDeletion) { + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ("", PrintContents(&batch)); + ASSERT_EQ(0u, batch.Count()); + batch.Put("a", "va"); + ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); + ASSERT_EQ(1u, batch.Count()); + batch.SingleDelete("a"); + ASSERT_EQ( + "SingleDelete(a)@101" + "Put(a, va)@100", + PrintContents(&batch)); + ASSERT_EQ(2u, batch.Count()); +} + +namespace { + struct TestHandler : public WriteBatch::Handler { + std::string seen; + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id == 0) { + seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "PutCF(" + ToString(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); + } + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + if (column_family_id == 0) { + seen += "Delete(" + key.ToString() + ")"; + } else { + seen += "DeleteCF(" + ToString(column_family_id) + ", " + + key.ToString() + ")"; + } + return Status::OK(); + } + Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { + if (column_family_id == 0) { + seen += "SingleDelete(" + key.ToString() + ")"; + } else { + seen += "SingleDeleteCF(" + ToString(column_family_id) + ", " + 
+ key.ToString() + ")"; + } + return Status::OK(); + } + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + if (column_family_id == 0) { + seen += "DeleteRange(" + begin_key.ToString() + ", " + + end_key.ToString() + ")"; + } else { + seen += "DeleteRangeCF(" + ToString(column_family_id) + ", " + + begin_key.ToString() + ", " + end_key.ToString() + ")"; + } + return Status::OK(); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id == 0) { + seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "MergeCF(" + ToString(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; + } + return Status::OK(); + } + void LogData(const Slice& blob) override { + seen += "LogData(" + blob.ToString() + ")"; + } + Status MarkBeginPrepare(bool unprepare) override { + seen += + "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")"; + return Status::OK(); + } + Status MarkEndPrepare(const Slice& xid) override { + seen += "MarkEndPrepare(" + xid.ToString() + ")"; + return Status::OK(); + } + Status MarkNoop(bool empty_batch) override { + seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")"; + return Status::OK(); + } + Status MarkCommit(const Slice& xid) override { + seen += "MarkCommit(" + xid.ToString() + ")"; + return Status::OK(); + } + Status MarkRollback(const Slice& xid) override { + seen += "MarkRollback(" + xid.ToString() + ")"; + return Status::OK(); + } + }; +} + +TEST_F(WriteBatchTest, PutNotImplemented) { + WriteBatch batch; + batch.Put(Slice("k1"), Slice("v1")); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, DeleteNotImplemented) { + WriteBatch batch; + batch.Delete(Slice("k2")); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { + WriteBatch batch; + batch.SingleDelete(Slice("k2")); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, MergeNotImplemented) { + WriteBatch batch; + batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST_F(WriteBatchTest, Blob) { + WriteBatch batch; + batch.Put(Slice("k1"), Slice("v1")); + batch.Put(Slice("k2"), Slice("v2")); + batch.Put(Slice("k3"), Slice("v3")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k2")); + batch.SingleDelete(Slice("k3")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_EQ(6u, batch.Count()); + ASSERT_EQ( + "Merge(foo, bar)@5" + "Put(k1, v1)@0" + "Delete(k2)@3" + "Put(k2, v2)@1" + "SingleDelete(k3)@4" + "Put(k3, v3)@2", + PrintContents(&batch)); + + TestHandler handler; + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "Put(k2, v2)" + "Put(k3, v3)" + "LogData(blob1)" + "Delete(k2)" + "SingleDelete(k3)" + "LogData(blob2)" + "Merge(foo, bar)", + handler.seen); +} + +TEST_F(WriteBatchTest, PrepareCommit) { + WriteBatch batch; + WriteBatchInternal::InsertNoop(&batch); + 
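// NOTE: sketch of the two-phase-commit framing this test builds (the
+ // leading Noop slot is consumed as the begin-prepare marker, matching the
+ // expected handler.seen string below):
+ //   MarkBeginPrepare, Put(k1, v1), Put(k2, v2), MarkEndPrepare(xid1),
+ //   MarkCommit(xid1), MarkRollback(xid1)
+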
batch.Put(Slice("k1"), Slice("v1"));
+ batch.Put(Slice("k2"), Slice("v2"));
+ batch.SetSavePoint();
+ WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"));
+ Status s = batch.RollbackToSavePoint();
+ ASSERT_EQ(s, Status::NotFound());
+ WriteBatchInternal::MarkCommit(&batch, Slice("xid1"));
+ WriteBatchInternal::MarkRollback(&batch, Slice("xid1"));
+ ASSERT_EQ(2u, batch.Count());
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "MarkBeginPrepare(false)"
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "MarkEndPrepare(xid1)"
+ "MarkCommit(xid1)"
+ "MarkRollback(xid1)",
+ handler.seen);
+}
+
+// It requires more than 30GB of memory to run the test, with a single memory
+// allocation of more than 30GB. Not all platforms can run it, and it also
+// runs a long time, so it is disabled.
+TEST_F(WriteBatchTest, DISABLED_ManyUpdates) {
+ // Insert billions of tiny key/value pairs, pushing the total batch size
+ // past 30GB.
+ static const size_t kKeyValueSize = 4u;
+ static const uint32_t kNumUpdates = uint32_t{3} << 30;
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u);
+ char c = 'A';
+ for (uint32_t i = 0; i < kNumUpdates; i++) {
+ if (c > 'Z') {
+ c = 'A';
+ }
+ raw[0] = c;
+ raw[raw.length() - 1] = c;
+ c++;
+ batch.Put(raw, raw);
+ }
+
+ ASSERT_EQ(kNumUpdates, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ uint32_t num_seen = 0;
+ char expected_char = 'A';
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ(expected_char, key[0]);
+ EXPECT_EQ(expected_char, value[0]);
+ EXPECT_EQ(expected_char, key[kKeyValueSize - 1]);
+ EXPECT_EQ(expected_char, value[kKeyValueSize - 1]);
+ expected_char++;
+ if (expected_char > 'Z') {
+ expected_char = 'A';
+ }
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < kNumUpdates; }
+ } handler;
+
+ batch.Iterate(&handler);
+ ASSERT_EQ(kNumUpdates, handler.num_seen);
+}
+
+// The test requires more than 18GB of memory to run, with a single memory
+// allocation of more than 12GB. Not all platforms can run it, so it is
+// disabled.
+TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) {
+ // Insert key and value of 3GB and push total batch size to 12GB.
+ static const size_t kKeyValueSize = 3221225472u; + std::string raw(kKeyValueSize, 'A'); + WriteBatch batch(size_t(12884901888ull + 1024u)); + for (char i = 0; i < 2; i++) { + raw[0] = 'A' + i; + raw[raw.length() - 1] = 'A' - i; + batch.Put(raw, raw); + } + + ASSERT_EQ(2u, batch.Count()); + + struct NoopHandler : public WriteBatch::Handler { + int num_seen = 0; + Status PutCF(uint32_t /*column_family_id*/, const Slice& key, + const Slice& value) override { + EXPECT_EQ(kKeyValueSize, key.size()); + EXPECT_EQ(kKeyValueSize, value.size()); + EXPECT_EQ('A' + num_seen, key[0]); + EXPECT_EQ('A' + num_seen, value[0]); + EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]); + EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]); + ++num_seen; + return Status::OK(); + } + Status DeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + ADD_FAILURE(); + return Status::OK(); + } + Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + ADD_FAILURE(); + return Status::OK(); + } + Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { + ADD_FAILURE(); + return Status::OK(); + } + void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); } + bool Continue() override { return num_seen < 2; } + } handler; + + batch.Iterate(&handler); + ASSERT_EQ(2, handler.num_seen); +} + +TEST_F(WriteBatchTest, Continue) { + WriteBatch batch; + + struct Handler : public TestHandler { + int num_seen = 0; + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + ++num_seen; + return TestHandler::PutCF(column_family_id, key, value); + } + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + ++num_seen; + return TestHandler::DeleteCF(column_family_id, key); + } + Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { + ++num_seen; + return TestHandler::SingleDeleteCF(column_family_id, key); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + ++num_seen; + return TestHandler::MergeCF(column_family_id, key, value); + } + void LogData(const Slice& blob) override { + ++num_seen; + TestHandler::LogData(blob); + } + bool Continue() override { return num_seen < 5; } + } handler; + + batch.Put(Slice("k1"), Slice("v1")); + batch.Put(Slice("k2"), Slice("v2")); + batch.PutLogData(Slice("blob1")); + batch.Delete(Slice("k1")); + batch.SingleDelete(Slice("k2")); + batch.PutLogData(Slice("blob2")); + batch.Merge(Slice("foo"), Slice("bar")); + batch.Iterate(&handler); + ASSERT_EQ( + "Put(k1, v1)" + "Put(k2, v2)" + "LogData(blob1)" + "Delete(k1)" + "SingleDelete(k2)", + handler.seen); +} + +TEST_F(WriteBatchTest, PutGatherSlices) { + WriteBatch batch; + batch.Put(Slice("foo"), Slice("bar")); + + { + // Try a write where the key is one slice but the value is two + Slice key_slice("baz"); + Slice value_slices[2] = { Slice("header"), Slice("payload") }; + batch.Put(SliceParts(&key_slice, 1), + SliceParts(value_slices, 2)); + } + + { + // One where the key is composite but the value is a single slice + Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; + Slice value_slice("value"); + batch.Put(SliceParts(key_slices, 3), + SliceParts(&value_slice, 1)); + } + + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_EQ("Put(baz, headerpayload)@101" + "Put(foo, bar)@100" + "Put(keypart2part3, value)@102", + PrintContents(&batch)); + ASSERT_EQ(3u, batch.Count()); +} + +namespace { +class 
ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+ explicit ColumnFamilyHandleImplDummy(int id)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
+ uint32_t GetID() const override { return id_; }
+ const Comparator* GetComparator() const override {
+ return BytewiseComparator();
+ }
+
+ private:
+ uint32_t id_;
+};
+} // namespace anonymous
+
+TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
+ WriteBatch batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+ batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+ batch.Delete(&eight, Slice("eightfoo"));
+ batch.SingleDelete(&two, Slice("twofoo"));
+ batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"));
+ batch.Merge(&three, Slice("threethree"), Slice("3three"));
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Merge(Slice("omom"), Slice("nom"));
+
+ TestHandler handler;
+ batch.Iterate(&handler);
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "DeleteRangeCF(2, 3foo, 4foo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+ WriteBatchWithIndex batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+ batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+ batch.Delete(&eight, Slice("eightfoo"));
+ batch.SingleDelete(&two, Slice("twofoo"));
+ batch.Merge(&three, Slice("threethree"), Slice("3three"));
+ batch.Put(&zero, Slice("foo"), Slice("bar"));
+ batch.Merge(Slice("omom"), Slice("nom"));
+
+ std::unique_ptr<WBWIIterator> iter;
+
+ iter.reset(batch.NewIterator(&eight));
+ iter->Seek("eightfoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&two));
+ iter->Seek("twofoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar2", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator());
+ iter->Seek("gggg");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&zero));
+ iter->Seek("foo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+
ASSERT_EQ("foo", iter->Entry().key.ToString()); + ASSERT_EQ("bar", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type); + ASSERT_EQ("foo", iter->Entry().key.ToString()); + ASSERT_EQ("bar", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type); + ASSERT_EQ("omom", iter->Entry().key.ToString()); + ASSERT_EQ("nom", iter->Entry().value.ToString()); + + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + TestHandler handler; + batch.GetWriteBatch()->Iterate(&handler); + ASSERT_EQ( + "Put(foo, bar)" + "PutCF(2, twofoo, bar2)" + "PutCF(8, eightfoo, bar8)" + "DeleteCF(8, eightfoo)" + "SingleDeleteCF(2, twofoo)" + "MergeCF(3, threethree, 3three)" + "Put(foo, bar)" + "Merge(omom, nom)", + handler.seen); +} +#endif // !ROCKSDB_LITE + +TEST_F(WriteBatchTest, SavePointTest) { + Status s; + WriteBatch batch; + batch.SetSavePoint(); + + batch.Put("A", "a"); + batch.Put("B", "b"); + batch.SetSavePoint(); + + batch.Put("C", "c"); + batch.Delete("A"); + batch.SetSavePoint(); + batch.SetSavePoint(); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Delete(A)@3" + "Put(A, a)@0" + "Put(B, b)@1" + "Put(C, c)@2", + PrintContents(&batch)); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Put(A, a)@0" + "Put(B, b)@1", + PrintContents(&batch)); + + batch.Delete("A"); + batch.Put("B", "bb"); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ("", PrintContents(&batch)); + + s = batch.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch)); + + batch.Put("D", "d"); + batch.Delete("A"); + + batch.SetSavePoint(); + + batch.Put("A", "aaa"); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Delete(A)@1" + "Put(D, d)@0", + PrintContents(&batch)); + + batch.SetSavePoint(); + + batch.Put("D", "d"); + batch.Delete("A"); + + ASSERT_OK(batch.RollbackToSavePoint()); + ASSERT_EQ( + "Delete(A)@1" + "Put(D, d)@0", + PrintContents(&batch)); + + s = batch.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ( + "Delete(A)@1" + "Put(D, d)@0", + PrintContents(&batch)); + + WriteBatch batch2; + + s = batch2.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch2)); + + batch2.Delete("A"); + batch2.SetSavePoint(); + + s = batch2.RollbackToSavePoint(); + ASSERT_OK(s); + ASSERT_EQ("Delete(A)@0", PrintContents(&batch2)); + + batch2.Clear(); + ASSERT_EQ("", PrintContents(&batch2)); + + batch2.SetSavePoint(); + + batch2.Delete("B"); + ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); + + batch2.SetSavePoint(); + s = batch2.RollbackToSavePoint(); + ASSERT_OK(s); + ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); + + s = batch2.RollbackToSavePoint(); + ASSERT_OK(s); + ASSERT_EQ("", PrintContents(&batch2)); + + s = batch2.RollbackToSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch2)); + + WriteBatch batch3; + + s = batch3.PopSavePoint(); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ("", PrintContents(&batch3)); + + batch3.SetSavePoint(); + batch3.Delete("A"); + + s = batch3.PopSavePoint(); + ASSERT_OK(s); + ASSERT_EQ("Delete(A)@0", PrintContents(&batch3)); +} + +TEST_F(WriteBatchTest, MemoryLimitTest) { + Status s; + // The header size is 12 bytes. 
The two Puts take 8 bytes which gives total + // of 12 + 8 * 2 = 28 bytes. + WriteBatch batch(0, 28); + + ASSERT_OK(batch.Put("a", "....")); + ASSERT_OK(batch.Put("b", "....")); + s = batch.Put("c", "...."); + ASSERT_TRUE(s.IsMemoryLimit()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h new file mode 100644 index 000000000..106d02041 --- /dev/null +++ b/src/rocksdb/db/write_callback.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class DB; + +class WriteCallback { + public: + virtual ~WriteCallback() {} + + // Will be called while on the write thread before the write executes. If + // this function returns a non-OK status, the write will be aborted and this + // status will be returned to the caller of DB::Write(). + virtual Status Callback(DB* db) = 0; + + // return true if writes with this callback can be batched with other writes + virtual bool AllowWriteBatching() = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc new file mode 100644 index 000000000..df7d673aa --- /dev/null +++ b/src/rocksdb/db/write_callback_test.cc @@ -0,0 +1,452 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
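+
+// NOTE: minimal shape of a WriteCallback (declared in write_callback.h
+// above): Callback() runs on the write thread and a non-OK status aborts
+// the write. Sketch (illustrative; the conflict flag is a stand-in):
+//
+//   class ConflictCheck : public WriteCallback {
+//    public:
+//     Status Callback(DB* /*db*/) override {
+//       return conflict_ ? Status::Busy() : Status::OK();
+//     }
+//     bool AllowWriteBatching() override { return true; }
+//     bool conflict_ = false;
+//   };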
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/write_callback.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+ string dbname;
+
+ WriteCallbackTest() {
+ dbname = test::PerThreadDBPath("write_callback_testdb");
+ }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+ bool was_called = false;
+
+ Status Callback(DB *db) override {
+ was_called = true;
+
+ // Make sure db is a DBImpl
+ DBImpl* db_impl = dynamic_cast<DBImpl*>(db);
+ if (db_impl == nullptr) {
+ return Status::InvalidArgument("");
+ }
+
+ return Status::OK();
+ }
+
+ bool AllowWriteBatching() override { return true; }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+ Status Callback(DB* /*db*/) override { return Status::Busy(); }
+ bool AllowWriteBatching() override { return true; }
+};
+
+class MockWriteCallback : public WriteCallback {
+ public:
+ bool should_fail_ = false;
+ bool allow_batching_ = false;
+ std::atomic<bool> was_called_{false};
+
+ MockWriteCallback() {}
+
+ MockWriteCallback(const MockWriteCallback& other) {
+ should_fail_ = other.should_fail_;
+ allow_batching_ = other.allow_batching_;
+ was_called_.store(other.was_called_.load());
+ }
+
+ Status Callback(DB* /*db*/) override {
+ was_called_.store(true);
+ if (should_fail_) {
+ return Status::Busy();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool AllowWriteBatching() override { return allow_batching_; }
+};
+
+TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
+ struct WriteOP {
+ WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
+
+ void Put(const string& key, const string& val) {
+ kvs_.push_back(std::make_pair(key, val));
+ write_batch_.Put(key, val);
+ }
+
+ void Clear() {
+ kvs_.clear();
+ write_batch_.Clear();
+ callback_.was_called_.store(false);
+ }
+
+ MockWriteCallback callback_;
+ WriteBatch write_batch_;
+ std::vector<std::pair<string, string>> kvs_;
+ };
+
+ // In each scenario we'll launch multiple threads to write.
+ // The size of each array equals the number of threads, and each boolean
+ // in it denotes whether the callback of the corresponding thread should
+ // succeed or fail.
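+ // NOTE: reading the table below: each inner vector is one scenario, and
+ // element i constructs WriteOP(should_fail) for thread i. For example,
+ // {true, false} means two concurrent writers where the first callback
+ // fails (its batch must not be applied) and the second succeeds.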
+  std::vector<std::vector<bool>> write_scenarios = {
+      {true},
+      {false},
+      {false, false},
+      {true, true},
+      {true, false},
+      {false, true},
+      {false, false, false},
+      {true, true, true},
+      {false, true, false},
+      {true, false, true},
+      {true, false, false, false, false},
+      {false, false, false, false, true},
+      {false, false, true, false, true},
+  };
+
+  for (auto& unordered_write : {true, false}) {
+    for (auto& seq_per_batch : {true, false}) {
+      for (auto& two_queues : {true, false}) {
+        for (auto& allow_parallel : {true, false}) {
+          for (auto& allow_batching : {true, false}) {
+            for (auto& enable_WAL : {true, false}) {
+              for (auto& enable_pipelined_write : {true, false}) {
+                for (auto& write_group : write_scenarios) {
+                  Options options;
+                  options.create_if_missing = true;
+                  options.unordered_write = unordered_write;
+                  options.allow_concurrent_memtable_write = allow_parallel;
+                  options.enable_pipelined_write = enable_pipelined_write;
+                  options.two_write_queues = two_queues;
+                  // Skip unsupported combinations
+                  if (options.enable_pipelined_write && seq_per_batch) {
+                    continue;
+                  }
+                  if (options.enable_pipelined_write &&
+                      options.two_write_queues) {
+                    continue;
+                  }
+                  if (options.unordered_write &&
+                      !options.allow_concurrent_memtable_write) {
+                    continue;
+                  }
+                  if (options.unordered_write &&
+                      options.enable_pipelined_write) {
+                    continue;
+                  }
+
+                  ReadOptions read_options;
+                  DB* db;
+                  DBImpl* db_impl;
+
+                  DestroyDB(dbname, options);
+
+                  DBOptions db_options(options);
+                  ColumnFamilyOptions cf_options(options);
+                  std::vector<ColumnFamilyDescriptor> column_families;
+                  column_families.push_back(ColumnFamilyDescriptor(
+                      kDefaultColumnFamilyName, cf_options));
+                  std::vector<ColumnFamilyHandle*> handles;
+                  auto open_s = DBImpl::Open(db_options, dbname,
+                                             column_families, &handles, &db,
+                                             seq_per_batch,
+                                             true /* batch_per_txn */);
+                  ASSERT_OK(open_s);
+                  assert(handles.size() == 1);
+                  delete handles[0];
+
+                  db_impl = dynamic_cast<DBImpl*>(db);
+                  ASSERT_TRUE(db_impl);
+
+                  // Writers that have called JoinBatchGroup.
+                  std::atomic<uint64_t> threads_joining(0);
+                  // Writers that have linked to the queue
+                  std::atomic<uint64_t> threads_linked(0);
+                  // Writers that pass WriteThread::JoinBatchGroup:Wait
+                  // sync-point.
+                  std::atomic<uint64_t> threads_verified(0);
+
+                  std::atomic<uint64_t> seq(
+                      db_impl->GetLatestSequenceNumber());
+                  ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
+
+                  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+                      "WriteThread::JoinBatchGroup:Start", [&](void*) {
+                        uint64_t cur_threads_joining =
+                            threads_joining.fetch_add(1);
+                        // Wait for the last joined writer to link to the
+                        // queue. In this way the writers link to the queue
+                        // one by one. This allows us to confidently detect
+                        // the first writer who increases threads_linked as
+                        // the leader.
+                        while (threads_linked.load() < cur_threads_joining) {
+                        }
+                      });
+
+                  // Verification once writers call JoinBatchGroup.
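+                  // Editorial note (not upstream): because the Start
+                  // callback above forces writers to link one at a time,
+                  // the writer that observes cur_threads_linked == 0 in
+                  // the next callback is necessarily the group leader, and
+                  // the one that observes write_group.size() - 1 is
+                  // necessarily the last writer; the writers in between
+                  // may link in any order.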
+                  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+                      "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+                        uint64_t cur_threads_linked =
+                            threads_linked.fetch_add(1);
+                        bool is_leader = false;
+                        bool is_last = false;
+
+                        // who am i
+                        is_leader = (cur_threads_linked == 0);
+                        is_last =
+                            (cur_threads_linked == write_group.size() - 1);
+
+                        // check my state
+                        auto* writer =
+                            reinterpret_cast<WriteThread::Writer*>(arg);
+
+                        if (is_leader) {
+                          ASSERT_TRUE(writer->state ==
+                                      WriteThread::State::STATE_GROUP_LEADER);
+                        } else {
+                          ASSERT_TRUE(writer->state ==
+                                      WriteThread::State::STATE_INIT);
+                        }
+
+                        // (meta test) the first WriteOP should indeed be
+                        // the first and the last should be the last (all
+                        // others can be out of order)
+                        if (is_leader) {
+                          ASSERT_TRUE(
+                              writer->callback->Callback(nullptr).ok() ==
+                              !write_group.front().callback_.should_fail_);
+                        } else if (is_last) {
+                          ASSERT_TRUE(
+                              writer->callback->Callback(nullptr).ok() ==
+                              !write_group.back().callback_.should_fail_);
+                        }
+
+                        threads_verified.fetch_add(1);
+                        // Wait here until all verification in this
+                        // sync-point callback finishes for all writers.
+                        while (threads_verified.load() < write_group.size()) {
+                        }
+                      });
+
+                  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+                      "WriteThread::JoinBatchGroup:DoneWaiting",
+                      [&](void* arg) {
+                        // check my state
+                        auto* writer =
+                            reinterpret_cast<WriteThread::Writer*>(arg);
+
+                        if (!allow_batching) {
+                          // no batching so everyone should be a leader
+                          ASSERT_TRUE(
+                              writer->state ==
+                              WriteThread::State::STATE_GROUP_LEADER);
+                        } else if (!allow_parallel) {
+                          ASSERT_TRUE(
+                              writer->state ==
+                                  WriteThread::State::STATE_COMPLETED ||
+                              (enable_pipelined_write &&
+                               writer->state ==
+                                   WriteThread::State::
+                                       STATE_MEMTABLE_WRITER_LEADER));
+                        }
+                      });
+
+                  std::atomic<uint32_t> thread_num(0);
+                  std::atomic<char> dummy_key(0);
+
+                  // Each write thread creates a random write batch and
+                  // writes to the DB with a write callback.
+                  std::function<void()> write_with_callback_func = [&]() {
+                    uint32_t i = thread_num.fetch_add(1);
+                    Random rnd(i);
+
+                    // leaders gotta lead
+                    while (i > 0 && threads_verified.load() < 1) {
+                    }
+
+                    // loser has to lose
+                    while (i == write_group.size() - 1 &&
+                           threads_verified.load() < write_group.size() - 1) {
+                    }
+
+                    auto& write_op = write_group.at(i);
+                    write_op.Clear();
+                    write_op.callback_.allow_batching_ = allow_batching;
+
+                    // insert some keys
+                    for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
+                      // grab unique key
+                      char my_key = dummy_key.fetch_add(1);
+
+                      string skey(5, my_key);
+                      string sval(10, my_key);
+                      write_op.Put(skey, sval);
+
+                      if (!write_op.callback_.should_fail_ && !seq_per_batch) {
+                        seq.fetch_add(1);
+                      }
+                    }
+                    if (!write_op.callback_.should_fail_ && seq_per_batch) {
+                      seq.fetch_add(1);
+                    }
+
+                    WriteOptions woptions;
+                    woptions.disableWAL = !enable_WAL;
+                    woptions.sync = enable_WAL;
+                    Status s;
+                    if (seq_per_batch) {
+                      class PublishSeqCallback : public PreReleaseCallback {
+                       public:
+                        PublishSeqCallback(DBImpl* db_impl_in)
+                            : db_impl_(db_impl_in) {}
+                        Status Callback(SequenceNumber last_seq,
+                                        bool /*not used*/, uint64_t,
+                                        size_t /*index*/,
+                                        size_t /*total*/) override {
+                          db_impl_->SetLastPublishedSequence(last_seq);
+                          return Status::OK();
+                        }
+                        DBImpl* db_impl_;
+                      } publish_seq_callback(db_impl);
+                      // seq_per_batch requires a natural batch separator
+                      // or Noop
+                      WriteBatchInternal::InsertNoop(&write_op.write_batch_);
+                      const size_t ONE_BATCH = 1;
+                      s = db_impl->WriteImpl(
+                          woptions, &write_op.write_batch_,
+                          &write_op.callback_, nullptr, 0, false, nullptr,
+                          ONE_BATCH,
+                          two_queues ?
+                                     &publish_seq_callback : nullptr);
+                    } else {
+                      s = db_impl->WriteWithCallback(
+                          woptions, &write_op.write_batch_,
+                          &write_op.callback_);
+                    }
+
+                    if (write_op.callback_.should_fail_) {
+                      ASSERT_TRUE(s.IsBusy());
+                    } else {
+                      ASSERT_OK(s);
+                    }
+                  };
+
+                  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+                  // do all the writes
+                  std::vector<port::Thread> threads;
+                  for (uint32_t i = 0; i < write_group.size(); i++) {
+                    threads.emplace_back(write_with_callback_func);
+                  }
+                  for (auto& t : threads) {
+                    t.join();
+                  }
+
+                  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+                  // check for keys
+                  string value;
+                  for (auto& w : write_group) {
+                    ASSERT_TRUE(w.callback_.was_called_.load());
+                    for (auto& kvp : w.kvs_) {
+                      if (w.callback_.should_fail_) {
+                        ASSERT_TRUE(db->Get(read_options, kvp.first, &value)
+                                        .IsNotFound());
+                      } else {
+                        ASSERT_OK(db->Get(read_options, kvp.first, &value));
+                        ASSERT_EQ(value, kvp.second);
+                      }
+                    }
+                  }
+
+                  ASSERT_EQ(seq.load(),
+                            db_impl->TEST_GetLastVisibleSequence());
+
+                  delete db;
+                  DestroyDB(dbname, options);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+  Options options;
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  DB* db;
+  DBImpl* db_impl;
+
+  DestroyDB(dbname, options);
+
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+
+  db_impl = dynamic_cast<DBImpl*>(db);
+  ASSERT_TRUE(db_impl);
+
+  WriteBatch wb;
+
+  wb.Put("a", "value.a");
+  wb.Delete("x");
+
+  // Test a simple Write
+  s = db->Write(write_options, &wb);
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a", value);
+
+  // Test WriteWithCallback
+  WriteCallbackTestWriteCallback1 callback1;
+  WriteBatch wb2;
+
+  wb2.Put("a", "value.a2");
+
+  s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+  ASSERT_OK(s);
+  ASSERT_TRUE(callback1.was_called);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  // Test WriteWithCallback for a callback that fails
+  WriteCallbackTestWriteCallback2 callback2;
+  WriteBatch wb3;
+
+  wb3.Put("a", "value.a3");
+
+  s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+  ASSERT_NOK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  delete db;
+  DestroyDB(dbname, options);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 000000000..5480aabd1
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
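+//
+// Editorial sketch (not part of the upstream patch): typical use of the
+// WriteController defined below, while holding the DB mutex. Everything
+// except the WriteController/Env API is illustrative.
+//
+//   WriteController wc(/*_delayed_write_rate=*/10000000u);  // 10MB/s
+//   auto token = wc.GetDelayToken(wc.delayed_write_rate());
+//   uint64_t micros = wc.GetDelay(env, /*num_bytes=*/1000000u);
+//   if (micros > 0) {
+//     // the caller is expected to sleep for `micros` before writing
+//   }
+//   token.reset();  // dropping the token lifts the delay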
+ +#include "db/write_controller.h" + +#include +#include +#include +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +std::unique_ptr WriteController::GetStopToken() { + ++total_stopped_; + return std::unique_ptr(new StopWriteToken(this)); +} + +std::unique_ptr WriteController::GetDelayToken( + uint64_t write_rate) { + total_delayed_++; + // Reset counters. + last_refill_time_ = 0; + bytes_left_ = 0; + set_delayed_write_rate(write_rate); + return std::unique_ptr(new DelayWriteToken(this)); +} + +std::unique_ptr +WriteController::GetCompactionPressureToken() { + ++total_compaction_pressure_; + return std::unique_ptr( + new CompactionPressureToken(this)); +} + +bool WriteController::IsStopped() const { + return total_stopped_.load(std::memory_order_relaxed) > 0; +} +// This is inside DB mutex, so we can't sleep and need to minimize +// frequency to get time. +// If it turns out to be a performance issue, we can redesign the thread +// synchronization model here. +// The function trust caller will sleep micros returned. +uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) { + if (total_stopped_.load(std::memory_order_relaxed) > 0) { + return 0; + } + if (total_delayed_.load(std::memory_order_relaxed) == 0) { + return 0; + } + + const uint64_t kMicrosPerSecond = 1000000; + const uint64_t kRefillInterval = 1024U; + + if (bytes_left_ >= num_bytes) { + bytes_left_ -= num_bytes; + return 0; + } + // The frequency to get time inside DB mutex is less than one per refill + // interval. + auto time_now = NowMicrosMonotonic(env); + + uint64_t sleep_debt = 0; + uint64_t time_since_last_refill = 0; + if (last_refill_time_ != 0) { + if (last_refill_time_ > time_now) { + sleep_debt = last_refill_time_ - time_now; + } else { + time_since_last_refill = time_now - last_refill_time_; + bytes_left_ += + static_cast(static_cast(time_since_last_refill) / + kMicrosPerSecond * delayed_write_rate_); + if (time_since_last_refill >= kRefillInterval && + bytes_left_ > num_bytes) { + // If refill interval already passed and we have enough bytes + // return without extra sleeping. + last_refill_time_ = time_now; + bytes_left_ -= num_bytes; + return 0; + } + } + } + + uint64_t single_refill_amount = + delayed_write_rate_ * kRefillInterval / kMicrosPerSecond; + if (bytes_left_ + single_refill_amount >= num_bytes) { + // Wait until a refill interval + // Never trigger expire for less than one refill interval to avoid to get + // time. + bytes_left_ = bytes_left_ + single_refill_amount - num_bytes; + last_refill_time_ = time_now + kRefillInterval; + return kRefillInterval + sleep_debt; + } + + // Need to refill more than one interval. Need to sleep longer. Check + // whether expiration will hit + + // Sleep just until `num_bytes` is allowed. 
+  uint64_t sleep_amount =
+      static_cast<uint64_t>(num_bytes /
+                            static_cast<long double>(delayed_write_rate_) *
+                            kMicrosPerSecond) +
+      sleep_debt;
+  last_refill_time_ = time_now + sleep_amount;
+  return sleep_amount;
+}
+
+uint64_t WriteController::NowMicrosMonotonic(Env* env) {
+  return env->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+  assert(controller_->total_stopped_ >= 1);
+  --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+  controller_->total_delayed_--;
+  assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+  controller_->total_compaction_pressure_--;
+  assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 000000000..785ae6896
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code-path. Write
+// stalls happen when compaction can't keep up with the write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding the DB mutex
+class WriteController {
+ public:
+  explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+                           int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+      : total_stopped_(0),
+        total_delayed_(0),
+        total_compaction_pressure_(0),
+        bytes_left_(0),
+        last_refill_time_(0),
+        low_pri_rate_limiter_(
+            NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+    set_max_delayed_write_rate(_delayed_write_rate);
+  }
+  ~WriteController() = default;
+
+  // When an actor (column family) requests a stop token, all writes will be
+  // stopped until the stop token is released (deleted)
+  std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, total delay for all
+  // writes to the DB will be controlled under the delayed write rate. Every
+  // write needs to call GetDelay() with the number of bytes writing to the
+  // DB, which returns the number of microseconds to sleep.
+  std::unique_ptr<WriteControllerToken> GetDelayToken(
+      uint64_t delayed_write_rate);
+  // When an actor (column family) requests a moderate token, compaction
+  // threads will be increased
+  std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+  // these three methods query the state of the WriteController
+  bool IsStopped() const;
+  bool NeedsDelay() const { return total_delayed_.load() > 0; }
+  bool NeedSpeedupCompaction() const {
+    return IsStopped() || NeedsDelay() || total_compaction_pressure_ > 0;
+  }
+  // return how many microseconds the caller needs to sleep after the call
+  // num_bytes: how many bytes to put into the DB.
+  // Prerequisite: DB mutex held.
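+  // Editorial note (not upstream): the intended call pattern, with
+  // illustrative names:
+  //   uint64_t delay = write_controller.GetDelay(env, batch_bytes);
+  //   if (delay > 0) {
+  //     // unlock the DB mutex, sleep `delay` micros, then proceed
+  //   }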
+  uint64_t GetDelay(Env* env, uint64_t num_bytes);
+  void set_delayed_write_rate(uint64_t write_rate) {
+    // avoid dividing by 0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    } else if (write_rate > max_delayed_write_rate()) {
+      write_rate = max_delayed_write_rate();
+    }
+    delayed_write_rate_ = write_rate;
+  }
+
+  void set_max_delayed_write_rate(uint64_t write_rate) {
+    // avoid dividing by 0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    }
+    max_delayed_write_rate_ = write_rate;
+    // update delayed_write_rate_ as well
+    delayed_write_rate_ = write_rate;
+  }
+
+  uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+  uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+  RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+  uint64_t NowMicrosMonotonic(Env* env);
+
+  friend class WriteControllerToken;
+  friend class StopWriteToken;
+  friend class DelayWriteToken;
+  friend class CompactionPressureToken;
+
+  std::atomic<int> total_stopped_;
+  std::atomic<int> total_delayed_;
+  std::atomic<int> total_compaction_pressure_;
+  uint64_t bytes_left_;
+  uint64_t last_refill_time_;
+  // write rate set at initialization or by `DBImpl::SetDBOptions`
+  uint64_t max_delayed_write_rate_;
+  // current write rate
+  uint64_t delayed_write_rate_;
+
+  std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
+
+class WriteControllerToken {
+ public:
+  explicit WriteControllerToken(WriteController* controller)
+      : controller_(controller) {}
+  virtual ~WriteControllerToken() {}
+
+ protected:
+  WriteController* controller_;
+
+ private:
+  // no copying allowed
+  WriteControllerToken(const WriteControllerToken&) = delete;
+  void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+  explicit StopWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+  explicit DelayWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+  explicit CompactionPressureToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~CompactionPressureToken();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 000000000..72d116798
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
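+//
+// Editorial note (not part of the upstream patch): these tests drive
+// GetDelay() with TimeSetEnv, an EnvWrapper whose NowNanos() is derived
+// from a manually advanced microsecond counter, so every expected delay
+// is exact arithmetic rather than wall-clock time. For example, in
+// ChangeDelayRateTest a token requesting more than the 40MB/s maximum is
+// clamped to 40MB/s, so a 20MB write expects
+// 20000000 / 40000000 * 1000000 = 500000 micros.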
+//
+#include <ratio>
+
+#include "db/write_controller.h"
+
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteControllerTest : public testing::Test {};
+
+class TimeSetEnv : public EnvWrapper {
+ public:
+  explicit TimeSetEnv() : EnvWrapper(nullptr) {}
+  uint64_t now_micros_ = 6666;
+  uint64_t NowNanos() override { return now_micros_ * std::milli::den; }
+};
+
+TEST_F(WriteControllerTest, ChangeDelayRateTest) {
+  TimeSetEnv env;
+  WriteController controller(40000000u);  // also set max delayed rate
+  controller.set_delayed_write_rate(10000000u);
+  auto delay_token_0 =
+      controller.GetDelayToken(controller.delayed_write_rate());
+  ASSERT_EQ(static_cast<uint64_t>(2000000),
+            controller.GetDelay(&env, 20000000u));
+  auto delay_token_1 = controller.GetDelayToken(2000000u);
+  ASSERT_EQ(static_cast<uint64_t>(10000000),
+            controller.GetDelay(&env, 20000000u));
+  auto delay_token_2 = controller.GetDelayToken(1000000u);
+  ASSERT_EQ(static_cast<uint64_t>(20000000),
+            controller.GetDelay(&env, 20000000u));
+  auto delay_token_3 = controller.GetDelayToken(20000000u);
+  ASSERT_EQ(static_cast<uint64_t>(1000000),
+            controller.GetDelay(&env, 20000000u));
+  // This is more than max rate. Max delayed rate will be used.
+  auto delay_token_4 =
+      controller.GetDelayToken(controller.delayed_write_rate() * 3);
+  ASSERT_EQ(static_cast<uint64_t>(500000),
+            controller.GetDelay(&env, 20000000u));
+}
+
+TEST_F(WriteControllerTest, SanityTest) {
+  WriteController controller(10000000u);
+  auto stop_token_1 = controller.GetStopToken();
+  auto stop_token_2 = controller.GetStopToken();
+
+  ASSERT_TRUE(controller.IsStopped());
+  stop_token_1.reset();
+  ASSERT_TRUE(controller.IsStopped());
+  stop_token_2.reset();
+  ASSERT_FALSE(controller.IsStopped());
+
+  TimeSetEnv env;
+
+  auto delay_token_1 = controller.GetDelayToken(10000000u);
+  ASSERT_EQ(static_cast<uint64_t>(2000000),
+            controller.GetDelay(&env, 20000000u));
+
+  env.now_micros_ += 1999900u;  // sleep debt 1000
+
+  auto delay_token_2 = controller.GetDelayToken(10000000u);
+  // Rate reset after changing the token.
+  ASSERT_EQ(static_cast<uint64_t>(2000000),
+            controller.GetDelay(&env, 20000000u));
+
+  env.now_micros_ += 1999900u;  // sleep debt 1000
+
+  // One refill: 10240 bytes allowed, 1000 used, 9240 left
+  ASSERT_EQ(static_cast<uint64_t>(1124), controller.GetDelay(&env, 1000u));
+  env.now_micros_ += 1124u;  // sleep debt 0
+
+  delay_token_2.reset();
+  // 1000 used, 8240 left
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+  env.now_micros_ += 100u;  // sleep credit 100
+  // 1000 used, 7240 left
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+  env.now_micros_ += 100u;  // sleep credit 200
+  // One refill: 10240 filled, sleep credit generates 2000. 8000 used
+  // 7240 + 10240 + 2000 - 8000 = 11480 left
+  ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+  env.now_micros_ += 200u;  // sleep debt 824
+  // 1000 used, 10480 left.
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+  env.now_micros_ += 200u;  // sleep debt 624
+  // Out of bound sleep, still 10480 left
+  ASSERT_EQ(static_cast<uint64_t>(3000624u),
+            controller.GetDelay(&env, 30000000u));
+
+  env.now_micros_ += 3000724u;  // sleep credit 100
+  // 6000 used, 4480 left.
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 6000u));
+
+  env.now_micros_ += 200u;  // sleep credit 300
+  // One refill, credit 4480 balance + 3000 credit + 10240 refill
+  // Use 8000, 9720 left
+  ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+  env.now_micros_ += 3024u;  // sleep credit 2000
+
+  // 1720 left
+  ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+  // 1720 balance + 20000 credit = 20170 left
+  // Use 8000, 12170 left
+  ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+  // 4170 left
+  ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+  // Need a refill
+  ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 9000u));
+
+  delay_token_1.reset();
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 30000000u));
+  delay_token_1.reset();
+  ASSERT_FALSE(controller.IsStopped());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 000000000..5f50bba63
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,777 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_thread.h"
+#include <chrono>
+#include <thread>
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+    : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+                          ? db_options.write_thread_max_yield_usec
+                          : 0),
+      slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+      allow_concurrent_memtable_write_(
+          db_options.allow_concurrent_memtable_write),
+      enable_pipelined_write_(db_options.enable_pipelined_write),
+      max_write_batch_group_size_bytes(
+          db_options.max_write_batch_group_size_bytes),
+      newest_writer_(nullptr),
+      newest_memtable_writer_(nullptr),
+      last_sequence_(0),
+      write_stall_dummy_(),
+      stall_mu_(),
+      stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+  // We're going to block. Lazily create the mutex. We guarantee
+  // propagation of this construction to the waker via the
+  // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex
+  // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+  // we install below.
+  w->CreateMutex();
+
+  auto state = w->state.load(std::memory_order_acquire);
+  assert(state != STATE_LOCKED_WAITING);
+  if ((state & goal_mask) == 0 &&
+      w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+    // we have permission (and an obligation) to use StateMutex
+    std::unique_lock<std::mutex> guard(w->StateMutex());
+    w->StateCV().wait(guard, [w] {
+      return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+    });
+    state = w->state.load(std::memory_order_relaxed);
+  }
+  // else tricky. Goal is met or CAS failed. In the latter case the waker
+  // must have changed the state, and compare_exchange_strong has updated
+  // our local variable with the new one.
+  // At the moment WriteThread never
+  // waits for a transition across intermediate states, so we know that
+  // since a state change has occurred the goal must have been met.
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+                                AdaptationContext* ctx) {
+  uint8_t state = 0;
+
+  // 1. Busy loop using "pause" for 1 micro sec
+  // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+  // 3. Else blocking wait
+
+  // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+  // is the effect of the pause instruction), so 200 iterations is a bit
+  // more than a microsecond. This is long enough that waits longer than
+  // this can amortize the cost of accessing the clock and yielding.
+  for (uint32_t tries = 0; tries < 200; ++tries) {
+    state = w->state.load(std::memory_order_acquire);
+    if ((state & goal_mask) != 0) {
+      return state;
+    }
+    port::AsmVolatilePause();
+  }
+
+  // This is below the fast path, so that the stat is zero when all writes are
+  // from the same thread.
+  PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+  // If we're only going to end up waiting a short period of time,
+  // it can be a lot more efficient to call std::this_thread::yield()
+  // in a loop than to block in StateMutex(). For reference, on my 4.0
+  // SELinux test server with support for syscall auditing enabled, the
+  // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+  // 2.7 usec, and the average is more like 10 usec. That can be a big
+  // drag on RocksDB's single-writer design. Of course, spinning is a
+  // bad idea if other threads are waiting to run or if we're going to
+  // wait for a long time. How do we decide?
+  //
+  // We break waiting into 3 categories: short-uncontended,
+  // short-contended, and long. If we had an oracle, then we would always
+  // spin for short-uncontended, always block for long, and our choice for
+  // short-contended might depend on whether we were trying to optimize
+  // RocksDB throughput or avoid being greedy with system resources.
+  //
+  // Bucketing into short or long is easy by measuring elapsed time.
+  // Differentiating short-uncontended from short-contended is a bit
+  // trickier, but not too bad. We could look for involuntary context
+  // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+  // (portability code and CPU) to just look for yield calls that take
+  // longer than we expect. sched_yield() doesn't actually result in any
+  // context switch overhead if there are no other runnable processes
+  // on the current core, in which case it usually takes less than
+  // a microsecond.
+  //
+  // There are two primary tunables here: the threshold between "short"
+  // and "long" waits, and the threshold at which we suspect that a yield
+  // is slow enough to indicate we should probably block. If these
+  // thresholds are chosen well then CPU-bound workloads that don't
+  // have more threads than cores will experience few context switches
+  // (voluntary or involuntary), and the total number of context switches
+  // (voluntary and involuntary) will not be dramatically larger (maybe
+  // 2x) than the number of voluntary context switches that occur when
+  // --max_yield_wait_micros=0.
+  //
+  // There's another constant, which is the number of slow yields we will
+  // tolerate before reversing our previous decision. Solitary slow
+  // yields are pretty common (low-priority small jobs ready to run),
+  // so this should be at least 2.
+  // We set this conservatively to 3 so
+  // that we can also immediately schedule a ctx adaptation, rather than
+  // waiting for the next update_ctx.
+
+  const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+  // Whether the yield approach has any credit in this context. The credit is
+  // added by yield being successful before timing out, and decreased
+  // otherwise.
+  auto& yield_credit = ctx->value;
+  // Update the yield_credit based on sample runs or right after a hard
+  // failure
+  bool update_ctx = false;
+  // Should we reinforce the yield credit
+  bool would_spin_again = false;
+  // The sampling base for updating the yield credit. The sampling rate would
+  // be 1/sampling_base.
+  const int sampling_base = 256;
+
+  if (max_yield_usec_ > 0) {
+    update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+    if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+      // we're updating the adaptation statistics, or spinning has >
+      // 50% chance of being shorter than max_yield_usec_ and causing no
+      // involuntary context switches
+      auto spin_begin = std::chrono::steady_clock::now();
+
+      // this variable doesn't include the final yield (if any) that
+      // causes the goal to be met
+      size_t slow_yield_count = 0;
+
+      auto iter_begin = spin_begin;
+      while ((iter_begin - spin_begin) <=
+             std::chrono::microseconds(max_yield_usec_)) {
+        std::this_thread::yield();
+
+        state = w->state.load(std::memory_order_acquire);
+        if ((state & goal_mask) != 0) {
+          // success
+          would_spin_again = true;
+          break;
+        }
+
+        auto now = std::chrono::steady_clock::now();
+        if (now == iter_begin ||
+            now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+          // conservatively count it as a slow yield if our clock isn't
+          // accurate enough to measure the yield duration
+          ++slow_yield_count;
+          if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+            // Not just one ivcsw, but several. Immediately update
+            // yield_credit and fall back to blocking
+            update_ctx = true;
+            break;
+          }
+        }
+        iter_begin = now;
+      }
+    }
+  }
+
+  if ((state & goal_mask) == 0) {
+    TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+    state = BlockingAwaitState(w, goal_mask);
+  }
+
+  if (update_ctx) {
+    // Since our update is sample based, it is ok if a thread overwrites the
+    // updates by other threads. Thus the update does not have to be atomic.
+    auto v = yield_credit.load(std::memory_order_relaxed);
+    // fixed point exponential decay with decay constant 1/1024, with +1
+    // and -1 scaled to avoid overflow for int32_t
+    //
+    // On each update the positive credit is decayed by a factor of 1/1024
+    // (i.e., 0.1%). If the sampled yield was successful, the credit is also
+    // increased by X. Setting X=2^17 ensures that the credit never exceeds
+    // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t.
+    // The same logic applies to negative credits.
+    v = v - (v / 1024) + (would_spin_again ?
+                                           1 : -1) * 131072;
+    yield_credit.store(v, std::memory_order_relaxed);
+  }
+
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+  auto state = w->state.load(std::memory_order_acquire);
+  if (state == STATE_LOCKED_WAITING ||
+      !w->state.compare_exchange_strong(state, new_state)) {
+    assert(state == STATE_LOCKED_WAITING);
+
+    std::lock_guard<std::mutex> guard(w->StateMutex());
+    assert(w->state.load(std::memory_order_relaxed) != new_state);
+    w->state.store(new_state, std::memory_order_relaxed);
+    w->StateCV().notify_one();
+  }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  assert(w->state == STATE_INIT);
+  Writer* writers = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    // If a write stall is in effect and w->no_slowdown is not true,
+    // block here until the stall is cleared. If it is true, return
+    // immediately
+    if (writers == &write_stall_dummy_) {
+      if (w->no_slowdown) {
+        w->status = Status::Incomplete("Write stall");
+        SetState(w, STATE_COMPLETED);
+        return false;
+      }
+      // Since no_slowdown is false, wait here to be notified of the write
+      // stall clearing
+      {
+        MutexLock lock(&stall_mu_);
+        writers = newest_writer->load(std::memory_order_relaxed);
+        if (writers == &write_stall_dummy_) {
+          stall_cv_.Wait();
+          // Load newest_writers_ again since it may have changed
+          writers = newest_writer->load(std::memory_order_relaxed);
+          continue;
+        }
+      }
+    }
+    w->link_older = writers;
+    if (newest_writer->compare_exchange_weak(writers, w)) {
+      return (writers == nullptr);
+    }
+  }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+                            std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+  Writer* w = last_writer;
+  while (true) {
+    // Unset link_newer pointers to make sure when we call
+    // CreateMissingNewerLinks later it creates all missing links.
+    w->link_newer = nullptr;
+    w->write_group = nullptr;
+    if (w == leader) {
+      break;
+    }
+    w = w->link_older;
+  }
+  Writer* newest = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    leader->link_older = newest;
+    if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+      return (newest == nullptr);
+    }
+  }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+  while (true) {
+    Writer* next = head->link_older;
+    if (next == nullptr || next->link_newer != nullptr) {
+      assert(next == nullptr || next->link_newer == head);
+      break;
+    }
+    next->link_newer = head;
+    head = next;
+  }
+}
+
+WriteThread::Writer* WriteThread::FindNextLeader(Writer* from,
+                                                 Writer* boundary) {
+  assert(from != nullptr && from != boundary);
+  Writer* current = from;
+  while (current->link_older != boundary) {
+    current = current->link_older;
+    assert(current != nullptr);
+  }
+  return current;
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+  assert(write_group.size > 0);
+  Writer* leader = write_group.leader;
+  if (write_group.size == 1) {
+    write_group.leader = nullptr;
+    write_group.last_writer = nullptr;
+  } else {
+    assert(leader->link_newer != nullptr);
+    leader->link_newer->link_older = nullptr;
+    write_group.leader = leader->link_newer;
+  }
+  write_group.size -= 1;
+  SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+  assert(write_group.size > 1);
+  assert(w != write_group.leader);
+  if (w == write_group.last_writer) {
+    w->link_older->link_newer = nullptr;
+    write_group.last_writer = w->link_older;
+  } else {
+    w->link_older->link_newer = w->link_newer;
+    w->link_newer->link_older = w->link_older;
+  }
+  write_group.size -= 1;
+  SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+  LinkOne(&write_stall_dummy_, &newest_writer_);
+
+  // Walk the writer list until w->write_group != nullptr. The current write
+  // group will not have a mix of slowdown/no_slowdown, so it's ok to stop
+  // at that point
+  Writer* w = write_stall_dummy_.link_older;
+  Writer* prev = &write_stall_dummy_;
+  while (w != nullptr && w->write_group == nullptr) {
+    if (w->no_slowdown) {
+      prev->link_older = w->link_older;
+      w->status = Status::Incomplete("Write stall");
+      SetState(w, STATE_COMPLETED);
+      if (prev->link_older) {
+        prev->link_older->link_newer = prev;
+      }
+      w = prev->link_older;
+    } else {
+      prev = w;
+      w = w->link_older;
+    }
+  }
+}
+
+void WriteThread::EndWriteStall() {
+  MutexLock lock(&stall_mu_);
+
+  // Unlink write_stall_dummy_ from the write queue.
+  // This will unblock
+  // pending write threads so they can enqueue themselves
+  assert(newest_writer_.load(std::memory_order_relaxed) ==
+         &write_stall_dummy_);
+  assert(write_stall_dummy_.link_older != nullptr);
+  write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+  newest_writer_.exchange(write_stall_dummy_.link_older);
+
+  // Wake up writers
+  stall_cv_.SignalAll();
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+  assert(w->batch != nullptr);
+
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+  if (linked_as_leader) {
+    SetState(w, STATE_GROUP_LEADER);
+  }
+
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+
+  if (!linked_as_leader) {
+    /**
+     * Wait until:
+     * 1) An existing leader picks us as the new leader when it finishes
+     * 2) An existing leader picks us as its follower and
+     *    2.1) finishes the memtable writes on our behalf
+     *    2.2) or tells us to finish the memtable writes in parallel
+     * 3) (pipelined write) An existing leader picks us as its follower and
+     *    finishes book-keeping and the WAL write for us, enqueues us as a
+     *    pending memtable writer, and
+     *    3.1) we become the memtable writer group leader, or
+     *    3.2) an existing memtable writer group leader tells us to finish
+     *         memtable writes in parallel.
+     */
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+    AwaitState(w, STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+                      STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+               &jbg_ctx);
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+  }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+                                            WriteGroup* write_group) {
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
+  assert(write_group != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = max_write_batch_group_size_bytes;
+  const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+  if (size <= min_batch_size_bytes) {
+    max_size = size + min_batch_size_bytes;
+  }
+
+  leader->write_group = write_group;
+  write_group->leader = leader;
+  write_group->last_writer = leader;
+  write_group->size = 1;
+  Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+  // This is safe regardless of any db mutex status of the caller. Previous
+  // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
+  // (they emptied the list and then we added ourself as leader) or had to
+  // explicitly wake us up (the list was non-empty when we added ourself,
+  // so we have already received our MarkJoined).
+  CreateMissingNewerLinks(newest_writer);
+
+  // Tricky. Iteration start (leader) is exclusive and finish
+  // (newest_writer) is inclusive. Iteration goes from old to new.
+  Writer* w = leader;
+  while (w != newest_writer) {
+    w = w->link_newer;
+
+    if (w->sync && !leader->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (w->no_slowdown != leader->no_slowdown) {
+      // Do not mix writes that are ok with delays with the ones that
+      // request fail on delays.
+      break;
+    }
+
+    if (w->disable_wal != leader->disable_wal) {
+      // Do not mix writes that enable WAL with the ones whose
+      // WAL is disabled.
+      break;
+    }
+
+    if (w->batch == nullptr) {
+      // Do not include those writes with nullptr batch. Those are not writes,
+      // those are something else. They want to be alone
+      break;
+    }
+
+    if (w->callback != nullptr && !w->callback->AllowWriteBatching()) {
+      // don't batch writes that don't want to be batched
+      break;
+    }
+
+    auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+    if (size + batch_size > max_size) {
+      // Do not make batch too big
+      break;
+    }
+
+    w->write_group = write_group;
+    size += batch_size;
+    write_group->last_writer = w;
+    write_group->size++;
+  }
+  TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w);
+  return size;
+}
+
+void WriteThread::EnterAsMemTableWriter(Writer* leader,
+                                        WriteGroup* write_group) {
+  assert(leader != nullptr);
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
+  assert(write_group != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = max_write_batch_group_size_bytes;
+  const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+  if (size <= min_batch_size_bytes) {
+    max_size = size + min_batch_size_bytes;
+  }
+
+  leader->write_group = write_group;
+  write_group->leader = leader;
+  write_group->size = 1;
+  Writer* last_writer = leader;
+
+  if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
+    Writer* newest_writer = newest_memtable_writer_.load();
+    CreateMissingNewerLinks(newest_writer);
+
+    Writer* w = leader;
+    while (w != newest_writer) {
+      w = w->link_newer;
+
+      if (w->batch == nullptr) {
+        break;
+      }
+
+      if (w->batch->HasMerge()) {
+        break;
+      }
+
+      if (!allow_concurrent_memtable_write_) {
+        auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+        if (size + batch_size > max_size) {
+          // Do not make batch too big
+          break;
+        }
+        size += batch_size;
+      }
+
+      w->write_group = write_group;
+      last_writer = w;
+      write_group->size++;
+    }
+  }
+
+  write_group->last_writer = last_writer;
+  write_group->last_sequence =
+      last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) -
+      1;
+}
+
+void WriteThread::ExitAsMemTableWriter(Writer* /*self*/,
+                                       WriteGroup& write_group) {
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+
+  Writer* newest_writer = last_writer;
+  if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
+                                                       nullptr)) {
+    CreateMissingNewerLinks(newest_writer);
+    Writer* next_leader = last_writer->link_newer;
+    assert(next_leader != nullptr);
+    next_leader->link_older = nullptr;
+    SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
+  }
+  Writer* w = leader;
+  while (true) {
+    if (!write_group.status.ok()) {
+      w->status = write_group.status;
+    }
+    Writer* next = w->link_newer;
+    if (w != leader) {
+      SetState(w, STATE_COMPLETED);
+    }
+    if (w == last_writer) {
+      break;
+    }
+    w = next;
+  }
+  // Note that leader has to exit last, since it owns the write group.
+  SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
+  assert(write_group != nullptr);
+  write_group->running.store(write_group->size);
+  for (auto w : *write_group) {
+    SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
+  }
+}
+
+static WriteThread::AdaptationContext cpmtw_ctx(
+    "CompleteParallelMemTableWriter");
+// This method is called by both the leader and parallel followers
+bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
+  auto* write_group = w->write_group;
+  if (!w->status.ok()) {
+    std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
+    write_group->status = w->status;
+  }
+
+  if (write_group->running-- > 1) {
+    // we're not the last one
+    AwaitState(w, STATE_COMPLETED, &cpmtw_ctx);
+    return false;
+  }
+  // else we're the last parallel worker and should perform exit duties.
+  w->status = write_group->status;
+  return true;
+}
+
+void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
+  auto* write_group = w->write_group;
+
+  assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
+  assert(write_group->status.ok());
+  ExitAsBatchGroupLeader(*write_group, write_group->status);
+  assert(w->status.ok());
+  assert(w->state == STATE_COMPLETED);
+  SetState(write_group->leader, STATE_COMPLETED);
+}
+
+static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader");
+void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
+                                         Status status) {
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+  assert(leader->link_older == nullptr);
+
+  // Propagate memtable write error to the whole group.
+  if (status.ok() && !write_group.status.ok()) {
+    status = write_group.status;
+  }
+
+  if (enable_pipelined_write_) {
+    // Notify writers that don't write to memtable to exit.
+    for (Writer* w = last_writer; w != leader;) {
+      Writer* next = w->link_older;
+      w->status = status;
+      if (!w->ShouldWriteToMemtable()) {
+        CompleteFollower(w, write_group);
+      }
+      w = next;
+    }
+    if (!leader->ShouldWriteToMemtable()) {
+      CompleteLeader(write_group);
+    }
+
+    Writer* next_leader = nullptr;
+
+    // Look for the next leader before we call LinkGroup. If there are no
+    // pending writers, place a dummy writer at the tail of the queue
+    // so we know the boundary of the current write group.
+    Writer dummy;
+    Writer* expected = last_writer;
+    bool has_dummy = newest_writer_.compare_exchange_strong(expected, &dummy);
+    if (!has_dummy) {
+      // We found at least one pending writer when we inserted the dummy. We
+      // search for the next leader from there.
+      next_leader = FindNextLeader(expected, last_writer);
+      assert(next_leader != nullptr && next_leader != last_writer);
+    }
+
+    // Link the remaining writers of the group to the memtable writer list.
+    //
+    // We have to link our group to the memtable writer queue before waking
+    // up the next leader or setting newest_writer_ to null, otherwise the
+    // next leader can run ahead of us and link to the memtable writer queue
+    // before we do.
+    if (write_group.size > 0) {
+      if (LinkGroup(write_group, &newest_memtable_writer_)) {
+        // The leader can now be different from current writer.
+        SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
+      }
+    }
+
+    // If we inserted the dummy in the queue, remove it now and check if any
+    // pending writers joined the queue since we inserted the dummy. If so,
+    // look for the next leader again.
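+    // Editorial note (not upstream): concretely, if the queue held only
+    // the dummy and a writer W enqueued meanwhile, the CAS below fails
+    // with `expected` updated to point at W (the new newest writer), and
+    // FindNextLeader() walks link_older from W until it reaches the
+    // writer directly above the dummy, which becomes the next leader.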
+ if (has_dummy) { + assert(next_leader == nullptr); + expected = &dummy; + bool has_pending_writer = + !newest_writer_.compare_exchange_strong(expected, nullptr); + if (has_pending_writer) { + next_leader = FindNextLeader(expected, &dummy); + assert(next_leader != nullptr && next_leader != &dummy); + } + } + + if (next_leader != nullptr) { + next_leader->link_older = nullptr; + SetState(next_leader, STATE_GROUP_LEADER); + } + AwaitState(leader, STATE_MEMTABLE_WRITER_LEADER | + STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED, + &eabgl_ctx); + } else { + Writer* head = newest_writer_.load(std::memory_order_acquire); + if (head != last_writer || + !newest_writer_.compare_exchange_strong(head, nullptr)) { + // Either w wasn't the head during the load(), or it was the head + // during the load() but somebody else pushed onto the list before + // we did the compare_exchange_strong (causing it to fail). In the + // latter case compare_exchange_strong has the effect of re-reading + // its first param (head). No need to retry a failing CAS, because + // only a departing leader (which we are at the moment) can remove + // nodes from the list. + assert(head != last_writer); + + // After walking link_older starting from head (if not already done) + // we will be able to traverse w->link_newer below. This function + // can only be called from an active leader, only a leader can + // clear newest_writer_, we didn't, and only a clear newest_writer_ + // could cause the next leader to start their work without a call + // to MarkJoined, so we can definitely conclude that no other leader + // work is going on here (with or without db mutex). + CreateMissingNewerLinks(head); + assert(last_writer->link_newer->link_older == last_writer); + last_writer->link_newer->link_older = nullptr; + + // Next leader didn't self-identify, because newest_writer_ wasn't + // nullptr when they enqueued (we were definitely enqueued before them + // and are still in the list). That means leader handoff occurs when + // we call MarkJoined + SetState(last_writer->link_newer, STATE_GROUP_LEADER); + } + // else nobody else was waiting, although there might already be a new + // leader now + + while (last_writer != leader) { + last_writer->status = status; + // we need to read link_older before calling SetState, because as soon + // as it is marked committed the other thread's Await may return and + // deallocate the Writer. 
+      auto next = last_writer->link_older;
+      SetState(last_writer, STATE_COMPLETED);
+
+      last_writer = next;
+    }
+  }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+  assert(w != nullptr && w->batch == nullptr);
+  mu->Unlock();
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+  if (!linked_as_leader) {
+    TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+    // Last leader will not pick us as a follower since our batch is nullptr
+    AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+  }
+  if (enable_pipelined_write_) {
+    WaitForMemTableWriters();
+  }
+  mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+  assert(w != nullptr);
+  Writer* newest_writer = w;
+  if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+    CreateMissingNewerLinks(newest_writer);
+    Writer* next_leader = w->link_newer;
+    assert(next_leader != nullptr);
+    next_leader->link_older = nullptr;
+    SetState(next_leader, STATE_GROUP_LEADER);
+  }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+  assert(enable_pipelined_write_);
+  if (newest_memtable_writer_.load() == nullptr) {
+    return;
+  }
+  Writer w;
+  if (!LinkOne(&w, &newest_memtable_writer_)) {
+    AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+  }
+  newest_memtable_writer_.store(nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..878199714
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,431 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+  enum State : uint8_t {
+    // The initial state of a writer. This is a Writer that is
+    // waiting in JoinBatchGroup. This state can be left when another
+    // thread informs the waiter that it has become a group leader
+    // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+    // non-parallel informs a follower that its writes have been committed
+    // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+    // updates in parallel and needs this Writer to apply its batch (->
+    // STATE_PARALLEL_MEMTABLE_WRITER).
+    STATE_INIT = 1,
+
+    // The state used to inform a waiting Writer that it has become the
+    // leader, and it should now build a write batch group. Tricky:
+    // this state is not used if newest_writer_ is empty when a writer
+    // enqueues itself, because there is no need to wait (or even to
+    // create the mutex and condvar used to wait) in that case. This is
+    // a terminal state unless the leader chooses to make this a parallel
+    // batch, in which case the last parallel worker to finish will move
+    // the leader to STATE_COMPLETED.
+    STATE_GROUP_LEADER = 2,
+
+    // The state used to inform a waiting writer that it has become the
+    // leader of a memtable writer group. The leader will either write
+    // memtable for the whole group, or launch a parallel group write
+    // to memtable by calling LaunchParallelMemTableWriters.
+    STATE_MEMTABLE_WRITER_LEADER = 4,
+
+    // The state used to inform a waiting writer that it has become a
+    // parallel memtable writer. It can be the group leader who launched
+    // the parallel writer group, or one of the followers. The writer should
+    // then apply its batch to the memtable concurrently and call
+    // CompleteParallelMemTableWriter.
+    STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+    // A follower whose writes have been applied, or a parallel leader
+    // whose followers have all finished their work. This is a terminal
+    // state.
+    STATE_COMPLETED = 16,
+
+    // A state indicating that the thread may be waiting using StateMutex()
+    // and StateCondVar()
+    STATE_LOCKED_WAITING = 32,
+  };
+
+  struct Writer;
+
+  struct WriteGroup {
+    Writer* leader = nullptr;
+    Writer* last_writer = nullptr;
+    SequenceNumber last_sequence;
+    // before running goes to zero, status needs leader->StateMutex()
+    Status status;
+    std::atomic<size_t> running;
+    size_t size = 0;
+
+    struct Iterator {
+      Writer* writer;
+      Writer* last_writer;
+
+      explicit Iterator(Writer* w, Writer* last)
+          : writer(w), last_writer(last) {}
+
+      Writer* operator*() const { return writer; }
+
+      Iterator& operator++() {
+        assert(writer != nullptr);
+        if (writer == last_writer) {
+          writer = nullptr;
+        } else {
+          writer = writer->link_newer;
+        }
+        return *this;
+      }
+
+      bool operator!=(const Iterator& other) const {
+        return writer != other.writer;
+      }
+    };
+
+    Iterator begin() const { return Iterator(leader, last_writer); }
+    Iterator end() const { return Iterator(nullptr, nullptr); }
+  };
+
+  // Information kept for every waiting writer.
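+  // Editorial note (not upstream): a Writer is typically a stack object
+  // in the write path; it is linked into newest_writer_, waits in
+  // AwaitState() until a leader advances its `state`, and must stay alive
+  // until it reaches STATE_COMPLETED, since other threads may still reach
+  // it through link_older/link_newer until then.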
+  struct Writer {
+    WriteBatch* batch;
+    bool sync;
+    bool no_slowdown;
+    bool disable_wal;
+    bool disable_memtable;
+    size_t batch_cnt;  // if non-zero, number of sub-batches in the write batch
+    PreReleaseCallback* pre_release_callback;
+    uint64_t log_used;  // log number that this batch was inserted into
+    uint64_t log_ref;   // log number that memtable insert should reference
+    WriteCallback* callback;
+    bool made_waitable;          // records lazy construction of mutex and cv
+    std::atomic<uint8_t> state;  // write under StateMutex() or pre-link
+    WriteGroup* write_group;
+    SequenceNumber sequence;  // the sequence number to use for the first key
+    Status status;
+    Status callback_status;  // status returned by callback->Callback()
+
+    std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+    std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+    Writer* link_older;  // read/write only before linking, or as leader
+    Writer* link_newer;  // lazy, read/write only before linking, or as leader
+
+    Writer()
+        : batch(nullptr),
+          sync(false),
+          no_slowdown(false),
+          disable_wal(false),
+          disable_memtable(false),
+          batch_cnt(0),
+          pre_release_callback(nullptr),
+          log_used(0),
+          log_ref(0),
+          callback(nullptr),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    Writer(const WriteOptions& write_options, WriteBatch* _batch,
+           WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+           size_t _batch_cnt = 0,
+           PreReleaseCallback* _pre_release_callback = nullptr)
+        : batch(_batch),
+          sync(write_options.sync),
+          no_slowdown(write_options.no_slowdown),
+          disable_wal(write_options.disableWAL),
+          disable_memtable(_disable_memtable),
+          batch_cnt(_batch_cnt),
+          pre_release_callback(_pre_release_callback),
+          log_used(0),
+          log_ref(_log_ref),
+          callback(_callback),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    ~Writer() {
+      if (made_waitable) {
+        StateMutex().~mutex();
+        StateCV().~condition_variable();
+      }
+    }
+
+    bool CheckCallback(DB* db) {
+      if (callback != nullptr) {
+        callback_status = callback->Callback(db);
+      }
+      return callback_status.ok();
+    }
+
+    void CreateMutex() {
+      if (!made_waitable) {
+        // Note that made_waitable is tracked separately from state
+        // transitions, because we can't atomically create the mutex and
+        // link into the list.
+        made_waitable = true;
+        new (&state_mutex_bytes) std::mutex;
+        new (&state_cv_bytes) std::condition_variable;
+      }
+    }
+
+    // returns the aggregate status of this Writer
+    Status FinalStatus() {
+      if (!status.ok()) {
+        // a non-ok memtable write status takes precedence
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      } else if (!callback_status.ok()) {
+        // if the callback failed then that is the status we want
+        // because a memtable insert should not have been attempted
+        assert(callback != nullptr);
+        assert(status.ok());
+        return callback_status;
+      } else {
+        // if there is no callback then we only care about
+        // the memtable insert status
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      }
+    }
+
+    bool CallbackFailed() {
+      return (callback != nullptr) && !callback_status.ok();
+    }
+
+    bool ShouldWriteToMemtable() {
+      return status.ok() && !CallbackFailed() && !disable_memtable;
+    }
+
+    bool ShouldWriteToWAL() {
+      return status.ok() && !CallbackFailed() && !disable_wal;
+    }
+
+    // No other mutexes may be acquired while holding StateMutex(), it is
+    // always last in the order
+    std::mutex& StateMutex() {
+      assert(made_waitable);
+      return *static_cast<std::mutex*>(
+          static_cast<void*>(&state_mutex_bytes));
+    }
+
+    std::condition_variable& StateCV() {
+      assert(made_waitable);
+      return *static_cast<std::condition_variable*>(
+          static_cast<void*>(&state_cv_bytes));
+    }
+  };
+
+  struct AdaptationContext {
+    const char* name;
+    std::atomic<int32_t> value;
+
+    explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+  };
+
+  explicit WriteThread(const ImmutableDBOptions& db_options);
+
+  virtual ~WriteThread() = default;
+
+  // IMPORTANT: None of the methods in this class rely on the db mutex
+  // for correctness. All of the methods except JoinBatchGroup and
+  // EnterUnbatched may be called either with or without the db mutex held.
+  // Correctness is maintained by ensuring that only a single thread is
+  // a leader at a time.
+
+  // Registers w as ready to become part of a batch group, waits until the
+  // caller should perform some work, and returns the current state of the
+  // writer. If w has become the leader of a write batch group, returns
+  // STATE_GROUP_LEADER. If w has been made part of a sequential batch
+  // group and the leader has performed the write, returns STATE_COMPLETED.
+  // If w has been made part of a parallel batch group and is responsible
+  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+  //
+  // The db mutex SHOULD NOT be held when calling this function, because
+  // it will block.
+  //
+  // Writer* w: Writer to be executed as part of a batch group
+  void JoinBatchGroup(Writer* w);
+
+  // Constructs a write batch group led by leader, which should be a
+  // Writer passed to JoinBatchGroup on the current thread.
+  //
+  // Writer* leader: Writer that is STATE_GROUP_LEADER
+  // WriteGroup* write_group: Out-param of group members
+  // returns: Total batch group byte size
+  size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+  // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+  // and wakes up the next leader (if any).
+  //
+  // WriteGroup* write_group: the write group
+  // Status status: Status of write operation
+  void ExitAsBatchGroupLeader(WriteGroup& write_group, Status status);
+
+  // Exit batch group on behalf of batch group leader.
+  void ExitAsBatchGroupFollower(Writer* w);
+
+  // Constructs a write batch group led by leader from the
+  // newest_memtable_writer_ list.
+
+  // Constructs a write batch group led by leader from the
+  // newest_memtable_writers_ list. The leader should either write the
+  // memtable for the whole group and call ExitAsMemTableWriter, or launch
+  // a parallel memtable write through LaunchParallelMemTableWriters.
+  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);
+
+  // The memtable writer group leader, or the last finished writer in a
+  // parallel write group, exits from the newest_memtable_writers_ list,
+  // and wakes up the next leader if needed.
+  void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+  // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the
+  // non-leader members of this write batch group. Sets Writer::sequence
+  // before waking them up.
+  //
+  // WriteGroup* write_group: Extra state used to coordinate the parallel add
+  void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+  // Reports the completion of w's batch to the parallel group leader, and
+  // waits for the rest of the parallel batch to complete. Returns true
+  // if this thread is the last to complete, and hence should advance
+  // the sequence number and then call EarlyExitParallelGroup, false if
+  // someone else has already taken responsibility for that.
+  bool CompleteParallelMemTableWriter(Writer* w);
+
+  // Waits for all preceding writers (unlocking mu while waiting), then
+  // registers w as the currently proceeding writer.
+  //
+  // Writer* w: A Writer not eligible for batching
+  // InstrumentedMutex* mu: The db mutex, to unlock while waiting
+  // REQUIRES: db mutex held
+  void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+  // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+  // writers.
+  void ExitUnbatched(Writer* w);
+
+  // Waits for all parallel memtable writers to finish, in case pipelined
+  // write is enabled.
+  void WaitForMemTableWriters();
+
+  SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+    if (sequence > last_sequence_) {
+      last_sequence_ = sequence;
+    }
+    return last_sequence_;
+  }
+
+  // Inserts a dummy writer at the tail of the write queue to indicate a
+  // write stall, and fails any writers in the queue with no_slowdown set
+  // to true.
+  void BeginWriteStall();
+
+  // Removes the dummy writer and wakes up the waiting writers.
+  void EndWriteStall();
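+
+  // Illustrative sketch (editor's note, not part of the original source):
+  // the path taken by a parallel follower once the leader has called
+  // LaunchParallelMemTableWriters(). InsertIntoMemtable() is a hypothetical
+  // stand-in for the follower's actual memtable insert.
+  //
+  //   if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) {
+  //     w.status = InsertIntoMemtable(w.batch, w.sequence);  // hypothetical
+  //     if (write_thread.CompleteParallelMemTableWriter(&w)) {
+  //       // Last writer to finish exits on behalf of the whole group.
+  //       write_thread.ExitAsBatchGroupFollower(&w);
+  //     }
+  //   }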
+
+ private:
+  // See AwaitState.
+  const uint64_t max_yield_usec_;
+  const uint64_t slow_yield_usec_;
+
+  // Allow multiple writers to write to the memtable concurrently.
+  const bool allow_concurrent_memtable_write_;
+
+  // Enable pipelined writes to the WAL and the memtable.
+  const bool enable_pipelined_write_;
+
+  // The maximum limit on the number of bytes that are written in a single
+  // batch of WAL or memtable writes. It is enforced only when the leader's
+  // write size is larger than 1/8 of this limit.
+  const uint64_t max_write_batch_group_size_bytes;
+
+  // Points to the newest pending writer. Only the leader can remove
+  // elements; adding can be done lock-free by anybody.
+  std::atomic<Writer*> newest_writer_;
+
+  // Points to the newest pending memtable writer. Used only when pipelined
+  // write is enabled.
+  std::atomic<Writer*> newest_memtable_writer_;
+
+  // The last sequence that has been consumed by a writer. The sequence is
+  // not necessarily visible to reads because the write can still be ongoing.
+  SequenceNumber last_sequence_;
+
+  // A dummy writer to indicate a write stall condition. It is inserted at
+  // the tail of the writer queue by the leader, so newer writers can just
+  // check for it and bail out.
+  Writer write_stall_dummy_;
+
+  // Mutex and condvar for writers to block on during a write stall. Writers
+  // with no_slowdown set to false will wait on these rather than on the
+  // writer queue.
+  port::Mutex stall_mu_;
+  port::CondVar stall_cv_;
+
+  // Waits for w->state & goal_mask using w->StateMutex(). Returns
+  // the state that satisfies goal_mask.
+  uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+  // Blocks until w->state & goal_mask, returning the state value
+  // that satisfied the predicate. Uses ctx to adaptively use
+  // std::this_thread::yield() to avoid mutex overheads. ctx should be
+  // a context-dependent static.
+  uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+  // Sets the writer state and wakes the writer up if it is waiting.
+  void SetState(Writer* w, uint8_t new_state);
+
+  // Links w into the newest_writer list. Returns true if w was linked
+  // directly into the leader position. Safe to call from multiple threads
+  // without external locking.
+  bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+  // Links a write group into the newest_writer list as a whole, while
+  // keeping the order of the writers unchanged. Returns true if the group
+  // was linked directly into the leader position.
+  bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+  // Computes any missing link_newer links. Should not be called
+  // concurrently with itself.
+  void CreateMissingNewerLinks(Writer* head);
+
+  // Starting from a pending writer, follows link_older to search for the
+  // next leader, until we hit the boundary.
+  Writer* FindNextLeader(Writer* pending_writer, Writer* boundary);
+
+  // Sets the leader in write_group to the completed state and removes it
+  // from the write group.
+  void CompleteLeader(WriteGroup& write_group);
+
+  // Sets a follower in write_group to the completed state and removes it
+  // from the write group.
+  void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
-- 
cgit v1.2.3